Home Page

Read in data

library(ggplot2) # for graphics
## Warning: package 'ggplot2' was built under R version 4.1.2
library(MASS) # for maximum likelihood estimation

starfish <- read.table("C:/Users/andre/Desktop/Comp-Bio_381/TableS3_FigS3_data.csv",header=TRUE,sep=",")
str(starfish)
## 'data.frame':    124 obs. of  9 variables:
##  $ experiment    : chr  "one" "one" "one" "one" ...
##  $ tank          : chr  "one" "one" "one" "one" ...
##  $ species       : chr  "Evasterias" "Evasterias" "Evasterias" "Evasterias" ...
##  $ treatment     : chr  "4 Eva" "4 Eva" "4 Eva" "4 Eva" ...
##  $ arm_length    : int  39 41 49 56 33 39 48 57 40 45 ...
##  $ start_weight  : num  9.23 8.18 6.66 21.32 4.55 ...
##  $ end_weight    : num  10.28 9.54 7.1 24.58 3.98 ...
##  $ change_weight : num  1.05 1.366 0.44 3.268 -0.576 ...
##  $ percent_change: num  11.4 16.7 6.6 15.3 -12.7 ...
summary(starfish)
##   experiment            tank             species           treatment        
##  Length:124         Length:124         Length:124         Length:124        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    arm_length     start_weight       end_weight      change_weight   
##  Min.   :29.00   Min.   :  4.553   Min.   :  3.977   Min.   :-9.600  
##  1st Qu.:45.00   1st Qu.: 16.338   1st Qu.: 17.745   1st Qu.: 0.600  
##  Median :54.00   Median : 25.637   Median : 27.392   Median : 1.863  
##  Mean   :53.68   Mean   : 33.562   Mean   : 36.507   Mean   : 2.945  
##  3rd Qu.:62.00   3rd Qu.: 46.340   3rd Qu.: 50.475   3rd Qu.: 4.622  
##  Max.   :78.00   Max.   :128.400   Max.   :127.000   Max.   :24.400  
##  percent_change   
##  Min.   :-15.812  
##  1st Qu.:  1.914  
##  Median :  8.384  
##  Mean   : 11.690  
##  3rd Qu.: 17.220  
##  Max.   :100.000

Plotting Histogram Data

p1 <- ggplot(data=starfish, aes(x=arm_length, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add empirical density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)  
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get maximum likelihood parameters for normal

normPars <- fitdistr(starfish$arm_length,"normal")
print(normPars)
##       mean          sd    
##   53.6774194   11.5643699 
##  ( 1.0385111) ( 0.7343382)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 53.7 11.6
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 1.039 0.734
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 1.079 0 0 0.539
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 124
##  $ loglik  : num -479
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##     mean 
## 53.67742
normPars$estimate["sd"]
##       sd 
## 11.56437

Plot normal probability density

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(starfish$arm_length),len=length(starfish$arm_length))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(starfish$arm_length), args = list(mean = meanML, sd = sdML))
 p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot exponential probability density

expoPars <- fitdistr(starfish$arm_length,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(starfish$arm_length), args = list(rate=rateML))
 p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot uniform probability density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(starfish$arm_length), args = list(min=min(starfish$arm_length), max=max(starfish$arm_length)))
 p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot gamma probability density

gammaPars <- fitdistr(starfish$arm_length,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(starfish$arm_length), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot beta probability density

pSpecial <- ggplot(data=starfish, aes(x=arm_length/(max(arm_length + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=starfish$arm_length/max(starfish$arm_length + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(starfish$arm_length), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

What distribution best fits this dataset?:

The best fitting distribution for this dataset is the gamma distribution, although the normal distribution is close as well.

Simulate a new dataset.

newdata <- rgamma(n=(length(starfish$arm_length)),shape=shapeML, rate=rateML)
newdata <- data.frame(1:(length(starfish$arm_length)),newdata)
names(newdata) <- list("ID","myVar")
newdata <- newdata[newdata$myVar>0,]
str(newdata)
## 'data.frame':    124 obs. of  2 variables:
##  $ ID   : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ myVar: num  39.7 42.7 62.8 88.2 46.2 ...
summary(newdata$myVar)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   27.99   44.70   50.15   51.62   58.19   88.18
p2 <- ggplot(data=newdata, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.