Description of Samples and Populations¶

Introduction¶

Variable: a characteristic of a person or a thing that can be assigned a number or a category.

  • Categorical variable: a variable that records which of several categories a person or thing is in, either nominal or ordinal.
    • Nominal: gender (male, female), eye color (blue, brown, green), country of origin (USA, Canada, UK).
    • Ordinal: educational attainment (elementary, high school, college, postgraduate), Likert scale responses (strongly disagree, disagree, neutral, agree, strongly agree).
  • Numeric variable: a variable that records the amount of something, either continuous or discrete.
    • Weight of a baby (continuous).
    • Cholesterol concentration in a blood specimen (continuous).
    • Number of bacteria colonies in a petri dish (discrete).
    • Length of a DNA segment in base pairs (discrete).
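A small sketch of how these variable types are represented in R: categorical variables map onto factors (ordered factors for ordinal data), and numeric variables onto numeric vectors. The data values below are made up for illustration.

In [ ]:
# Two categorical variables (one nominal, one ordinal) and two numeric variables
# (one continuous, one discrete); all values are hypothetical
toy <- data.frame(
    eye_color = factor(c("blue", "brown", "green")),              # nominal
    education = factor(c("college", "elementary", "high school"),
                       levels = c("elementary", "high school", "college"),
                       ordered = TRUE),                           # ordinal
    weight    = c(3.2, 2.9, 3.5),                                 # continuous (kg)
    colonies  = c(12L, 7L, 20L)                                   # discrete (counts)
)
str(toy)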

Frequency Distributions¶

Frequency distribution: a display of the frequency, or number of occurrences, of each value in the data set.

Example: color of poinsettias¶

Poinsettias can be red, pink, or white. In one investigation of the hereditary mechanism controlling the color, 182 progeny of a certain parental cross were categorized by color.

  • Table:
Color Frequency (number of plants)
Pink 34
Red 108
White 40
Total 182
  • Bar chart:
In [2]:
library(ggplot2)
g <- ggplot(data = data.frame(Frequency = c(108, 34, 40), Color = c('Red', 'Pink', 'White')), aes(x = Color, y = Frequency)) +
    geom_bar(stat="identity") +
    geom_text(aes(label = Frequency), vjust = 1.6, color = "white", size = 5)+
    theme_bw() +
    theme(text = element_text(size = 20))
options(repr.plot.width=10, repr.plot.height=5)
g

Example: infant mortality¶

The following table shows the infant mortality rate (infant deaths per 1,000 live births) in each of seven countries in South Asia, as of 2013.

Country Infant mortality rate (deaths per 1,000 live births)
Bangladesh 47.3
Bhutan 40.0
India 44.6
Maldives 25.5
Nepal 41.8
Pakistan 59.4
Sri Lanka 9.2
In [7]:
g <- ggplot(data = data.frame(x = c(47.3, 40.0, 44.6, 25.5, 41.8, 59.4, 9.2)), aes(x = x)) +
    geom_dotplot(binwidth = 1) +
    scale_x_continuous(name = 'Infant mortality rate', breaks = seq(10, 60, 10)) +
    scale_y_continuous(NULL, breaks = NULL) +
    theme_bw() +
    theme(text = element_text(size = 20), aspect.ratio=1/5)
options(repr.plot.width=10, repr.plot.height=5)
g

Relative frequency¶

The frequency scale is often replaced by a relative frequency scale: $$\text{Relative frequency} = \frac{\text{Frequency}}{n},$$ where $n$ is the total number of observations. Alternatively, a relative frequency can be expressed as a percentage frequency.

Color Frequency (number of plants) Relative frequency Percent frequency
Pink 34 .19 19
Red 108 .59 59
White 40 .22 22
Total 182 1.00 100
In [90]:
g1 <- ggplot(data = data.frame(Frequency = c(108, 34, 40), Color = c('Red', 'Pink', 'White')), aes(x = Color, y = Frequency / sum(Frequency))) +  
    geom_bar(stat="identity") + 
    geom_text(aes(label = round(Frequency / sum(Frequency), 2)), vjust = 1.6, color = "white", size = 5)+
    labs(y = 'Relative frequency') +
    theme_bw() +
    theme(text = element_text(size = 20))
g2 <- ggplot(data = data.frame(Frequency = c(108, 34, 40), Color = c('Red', 'Pink', 'White')), aes(x = Color, y = Frequency / sum(Frequency))) +  
    geom_bar(stat="identity") + 
    geom_text(aes(label = paste0(100 * round(Frequency / sum(Frequency), 2), '%')), vjust = 1.6, color = "white", size = 5)+
    scale_y_continuous(name = 'Percent frequency', labels=scales::percent) +
    theme_bw() +
    theme(text = element_text(size = 20))
library(patchwork)
options(repr.plot.width=8, repr.plot.height=4)
g1 + g2

Grouped frequency distributions and histograms¶

For many data sets, it is necessary to group the data in order to condense the information adequately. (This is usually the case with continuous variables.)

Example: forced expiratory volume in children¶

A total of 654 children, comprising 336 boys and 318 girls, underwent examination to measure their forced expiratory volume in liters.

In [91]:
library(isdals)
data(fev)
fev$Gender <- ifelse(fev$Gender == 0, 'Female', 'Male')
table(fev$Gender)
g <- ggplot(data = fev, aes(x = FEV, y = after_stat(density))) +
    geom_histogram(color="black", fill = 'white', bins = 10) + # can also change bins to obtain finer or coarser histograms
    labs(x = "Forced expiratory volume (liters)", y = "Relative frequency") +
    facet_wrap(~Gender) +
    theme_bw() +
    theme(text = element_text(size = 20))
options(repr.plot.width=8, repr.plot.height=4)
g
Female   Male 
   318    336 

When discussing a set of data, we want to describe the shape, center, and spread of the distribution. The shape of a distribution can be indicated by a smooth curve that approximates the histogram.

In [92]:
g <- ggplot(data = fev, aes(x = FEV, y = after_stat(density))) +
    geom_histogram(color="black", fill = 'white', bins = 15) + # can also change bins to obtain finer or coarser histograms
    geom_density(adjust = 1.5) +
    #geom_vline(aes(xintercept = mean(FEV)), col = 'orange')+
    #geom_vline(aes(xintercept = median(FEV)), col = 'skyblue')+
    labs(x = "Forced expiratory volume (liters)", y = "Relative frequency") +
    facet_wrap(~Gender) +
    theme_bw() +
    theme(text = element_text(size = 20))
options(repr.plot.width=8, repr.plot.height=4)
g

Shapes of distributions¶

A common shape for biological data is unimodal (has one mode) and is somewhat skewed to the right, as in (c). Approximately bell-shaped distributions, as in (a), also occur. Sometimes a distribution is symmetric but differs from a bell in having long tails; an exaggerated version is shown in (b). Left-skewed (d) and exponential (e) shapes are less common. Bimodality (two modes), as in (f), can indicate the existence of two distinct subgroups of observational units.

Shapes of distributions

How to tell if a distribution is left skewed or right skewed¶

A skewed distribution occurs when one tail is longer than the other. Skewness describes the asymmetry of a distribution.

  • Skewed to the right: The mean is greater than the median.
  • Skewed to the left: The mean is less than the median.

    • The mean is the average of a data set.
    • The mode is the most common number in a data set.
    • The median is the middle of the set of numbers.
In [9]:
set.seed(1) # fix the random seed so the simulated histograms are reproducible
# Normal distribution
g1 <- ggplot(data = data.frame(x = rnorm(1000)), aes(x = x, y = after_stat(density))) +
    geom_histogram(color="black", fill = 'white', bins = 20) +
    geom_density() +
    geom_vline(aes(xintercept = mean(x)), col = 'orange') +
    geom_vline(aes(xintercept = median(x)), col = 'blue', linetype = 'longdash') +
    labs(x = "", y = "Relative frequency") +
    theme_bw() +
    theme(text = element_text(size = 20))
# Gamma distribution
g2 <- ggplot(data = data.frame(x = rgamma(1000, shape = 0.6)), aes(x = x, y = after_stat(density))) +
    geom_histogram(color="black", fill = 'white', bins = 20) +
    geom_density() +
    geom_vline(aes(xintercept = mean(x)), col = 'orange') +
    geom_vline(aes(xintercept = median(x)), col = 'blue', linetype = 'longdash') +
    labs(x = "", y = "Relative frequency") +
    theme_bw() +
    theme(text = element_text(size = 20))
# Beta distribution
g3 <- ggplot(data = data.frame(x = rbeta(1000, shape1 = 8, shape2 = 1)), aes(x = x, y = after_stat(density))) +
    geom_histogram(color="black", fill = 'white', bins = 20) +
    geom_density() +
    geom_vline(aes(xintercept = mean(x)), col = 'orange') +
    geom_vline(aes(xintercept = median(x)), col = 'blue', linetype = 'longdash') +
    labs(x = "", y = "Relative frequency") +
    theme_bw() +
    theme(text = element_text(size = 20))
In [10]:
# orange solid: mean
# blue dashed: median
library(patchwork)
options(repr.plot.width=20, repr.plot.height=5)
g1 + g2 + g3

Descriptive Statistics: Measures of Center¶

  • A numerical measure calculated from sample data is called a statistic.
  • Descriptive statistics are statistics that describe a set of data.
  • Usually the descriptive statistics for a sample are calculated in order to provide information about a population of interest.
  • Two most widely used measures of center: the median and the mean.

Median $\tilde{y}$¶

  • The sample median is the value that most nearly lies in the middle of the sample, i.e., the data value that splits the ordered data into two equal halves.
  • To find the median, first arrange the observations in increasing order. In the array of ordered observations, the median is the middle value (if $n$ is odd) or midway between the two middle values (if $n$ is even).

Example: weight gain of lambs¶

  • The following are the 2-week weight gains (lb) of six young lambs of the same breed that had been raised on the same diet:
    • 11 13 19 2 10 1
  • Suppose the sample contained one more lamb, whose 2-week weight gain was 10 lb.

A more formal way to define the median is in terms of rank position in the ordered array (counting the smallest observation as rank 1, the next as 2, and so on). The rank position of the median is equal to $(0.5)(n + 1)$. Note that the formula $(0.5)(n + 1)$ does not give the median; it gives the location of the median within the ordered list of the data.
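A quick check of these rules in R, using the lamb data from the example:

In [ ]:
gains <- c(11, 13, 19, 2, 10, 1)
sort(gains)          # 1 2 10 11 13 19
median(gains)        # n = 6 is even: average of the two middle values, (10 + 11) / 2 = 10.5
median(c(gains, 10)) # n = 7 is odd: the middle (4th) ordered value, 10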

Mean $\bar{y}$¶

  • The most familiar measure of center is the ordinary average or mean (sometimes called the arithmetic mean).
  • The mean of a sample (or "the sample mean") is the sum of the observations divided by the number of observations.
  • The general definition of the sample mean is $\bar{y}=\frac{1}{n}\sum_{i=1}^ny_i$, where the $y_i$ are the observations in the sample and $n$ is the sample size.

Example: weight gain of lambs¶

  • The following are the 2-week weight gains (lb) of six young lambs of the same breed that had been raised on the same diet:
    • 11 13 19 2 10 1
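The computation in R:

In [ ]:
gains <- c(11, 13, 19, 2, 10, 1)
sum(gains) / length(gains) # (11 + 13 + 19 + 2 + 10 + 1) / 6 = 56 / 6
mean(gains)                # about 9.33 lb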

Robustness¶

A statistic is said to be robust if the value of the statistic is relatively unaffected by changes in a small portion of the data, even if the changes are dramatic ones.

Recall that for the lamb weight-gain data,

  • if the observation 19 is changed to 14, the mean becomes 8.5 and the median does not change;
  • if the observation 19 is changed to 29, the mean becomes 11 and the median does not change.
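This behavior is easy to verify in R:

In [ ]:
gains <- c(11, 13, 19, 2, 10, 1)
c(mean = mean(gains), median = median(gains))     # 9.33 and 10.5
changed <- replace(gains, gains == 19, 14)
c(mean = mean(changed), median = median(changed)) # 8.5 and 10.5: the median is unaffected
changed <- replace(gains, gains == 19, 29)
c(mean = mean(changed), median = median(changed)) # 11 and 10.5: the median is unaffected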

Median vs. mean¶

  • While the median divides the data into two equal pieces (i.e., the same number of observations above and below), the mean is the "point of balance" of the data.
  • The median is more robust than the mean.
  • An advantage of the mean is that in some circumstances it is more efficient than the median. Partly because of its efficiency, the mean has played a major role in classical methods in statistics.
  • If the frequency distribution is symmetric, the mean and the median are equal and fall in the center of the distribution. If the frequency distribution is skewed, both measures are pulled toward the longer tail, but the mean is usually pulled farther than the median.

Illustration: the median as the halfway point of the data and the mean as the point of balance

Boxplots¶

One of the most efficient graphics, both for examining a single distribution and for making comparisons between distributions, is known as a boxplot.

Quartile and the interquartile range¶

  • The median of a distribution splits the distribution into two parts, a lower part and an upper part. The quartiles of a distribution divide each of these parts in half, thereby dividing the distribution into four quarters.
  • The minimum, the maximum, the median, and the quartiles, taken together, are referred to as the five-number summary of the data.

Example: blood pressure¶

  • The systolic blood pressures (mm Hg) of seven middle-aged men were as follows:
    • 151 124 132 170 146 124 113

Quartiles of the blood pressure data

  • Suppose one more observation, 130, is added to the sample.
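A sketch of these computations in R. Note that R's quantile conventions (for example, the hinges reported by fivenum()) interpolate slightly differently from the median-of-halves method used here, so the reported Q1 and Q3 may differ a little from the hand calculation.

In [ ]:
bp <- c(151, 124, 132, 170, 146, 124, 113)
sort(bp)            # 113 124 124 132 146 151 170
median(bp)          # 132
fivenum(bp)         # min, lower hinge, median, upper hinge, max
median(c(bp, 130))  # with the eighth observation: (130 + 132) / 2 = 131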

Interquartile range¶

The interquartile range is the difference between the third and first quartiles and is abbreviated as IQR; it measures the spread of the middle 50% of the distribution.

$$\mathrm{IQR}=Q_3-Q_1$$

Recall that for the blood pressure data, $Q_1=124$ and $Q_3=151$. It follows that $\mathrm{IQR}=151-124=27$.

Outliers¶

  • Sometimes a data point differs so much from the rest of the data that it doesn't seem to belong with the other data. Such a point is called an outlier.
  • An outlier might occur because of a recording error or typographical error when the data are recorded, because of an equipment failure during an experiment, or for many other reasons.

To give a definition of an outlier, we first discuss what are known as fences.

  • The lower fence of a distribution is $$\text{lower fence}=Q_1-1.5\times\mathrm{IQR}$$
  • The upper fence of a distribution is $$\text{upper fence}=Q_3+1.5\times\mathrm{IQR}$$

An outlier is a data point that falls outside of the fences. That is, if $$\text{data point}<Q_1-1.5\times\mathrm{IQR}$$ or $$\text{data point}>Q_3+1.5\times\mathrm{IQR}$$ then we call the point an outlier.

Recall that for the blood pressure data, $Q_1=124$, $Q_3=151$, and $\mathrm{IQR}=27$. It follows that the lower fence is $124-1.5\times27=83.5$ and the upper fence is $151+1.5\times27=191.5$. Any point less than $83.5$ or greater than $191.5$ would be an outlier. There are thus no outliers in this data set.
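The same bookkeeping in R, using the quartiles from the text:

In [ ]:
bp <- c(151, 124, 132, 170, 146, 124, 113)
q1 <- 124; q3 <- 151                               # quartiles from the text
iqr <- q3 - q1                                     # 27
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)  # 83.5 and 191.5
bp[bp < q1 - 1.5 * iqr | bp > q3 + 1.5 * iqr]      # empty vector: no outliers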

Example: radish growth in light¶

A common biology experiment involves growing radish seedlings under various conditions. In one experiment students grew 14 radish seedlings in constant light. The observations, in order, are

Radish Growth

Boxplots for data with no outliers¶

A boxplot is a visual representation of the five-number summary.

Boxplot interpretation

The boxplot of the blood pressure data is shown below.
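A minimal sketch of how it can be drawn with ggplot2; note that geom_boxplot() computes Tukey-style hinges, which can differ slightly from hand-computed quartiles.

In [ ]:
g <- ggplot(data = data.frame(bp = c(151, 124, 132, 170, 146, 124, 113)), aes(y = bp)) +
    geom_boxplot() +
    labs(y = "Systolic blood pressure (mm Hg)") +
    theme_bw() +
    theme(text = element_text(size = 20),
          axis.text.x = element_blank(), axis.ticks.x = element_blank())
options(repr.plot.width=4, repr.plot.height=5)
g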

Boxplots for data with outliers¶

If there are outliers in the lower or upper part of the distribution, we identify them with dots and extend a whisker from $Q_1$ down to the smallest observation that is not an outlier or from $Q_3$ up to the largest data point that is not an outlier.

Boxplot of data on radish growth

How to read boxplots?¶

  • Spread: The length of the box and the whiskers provides information about the spread or variability of the data. A longer box and longer whiskers indicate greater variability, while a shorter box and shorter whiskers indicate less variability.
  • Outliers: Any data points plotted beyond the whiskers are considered outliers and may be worth further investigation as they deviate significantly from the rest of the dataset.

Comparing boxplots

  • Skewness: The position of the median within the box can indicate the skewness of the distribution. If the median is closer to the upper quartile, the distribution is left-skewed, while if it is closer to the lower quartile, the distribution is right-skewed.

Skewness in boxplots

Percentiles and quantiles¶

  • The percentile is a data value where a certain percentage of observations fall below that data value. The $q$th percentile of a sample is the value below which $q$ percent of the individuals lie.
  • The same information in a percentile is sometimes represented as a quantile. This only means that the proportion less than or equal to the given value is represented as a decimal rather than as a percentage.
    • Median: $50$th percentile
    • First quartile: $25$th percentile
    • Third quartile: $75$th percentile
    • Minimum: $0$th percentile
    • Maximum: $100$th percentile
    • $10$th percentile: $0.10$ quantile
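In R, quantile() indexes percentiles by their decimal proportion (i.e., as quantiles); its default interpolation rule is one of several conventions, so results may differ slightly from hand calculations.

In [ ]:
bp <- c(151, 124, 132, 170, 146, 124, 113)
quantile(bp, probs = c(0, 0.10, 0.25, 0.50, 0.75, 1)) # min, 10th percentile, quartiles, max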

Relationship between Variables¶

Categorical-categorical relationships¶

Suppose we are studying the relationship between the diet (plant-based or animal-based) and the occurrence of a specific health condition (e.g., high blood pressure) among a group of individuals.

  • bivariate frequency table
Diet Type Health Condition: Yes Health Condition: No
Plant-based 20 35
Animal-based 45 30
  • stacked bar charts
  • stacked relative frequency (or percentage) bar charts
In [3]:
# Create a data frame with the bivariate frequency table data
data <- data.frame(
    Diet_Type = c("Plant-based", "Plant-based", "Animal-based", "Animal-based"),
    Health_Condition = c("Yes", "No", "Yes", "No"),
    Frequency = c(20, 35, 45, 30)
)

# Create the stacked bar chart
g <- ggplot(data, aes(x = Diet_Type, y = Frequency, fill = Health_Condition)) +
    geom_bar(stat = "identity") +
    labs(x = "Diet type", y = "Frequency", fill = "Health condition") +
    theme_bw() +
    theme(text = element_text(size = 20))
options(repr.plot.width=10, repr.plot.height=5)
g
In [4]:
# Calculate relative frequencies within each diet type
data <- transform(data, Relative_Frequency = Frequency / tapply(Frequency, Diet_Type, sum)[Diet_Type])

# Create the stacked relative frequency bar chart
g <- ggplot(data, aes(x = Diet_Type, y = Relative_Frequency, fill = Health_Condition)) +
    geom_bar(stat = "identity") +
    labs(x = "Diet type", y = "Relative frequency", fill = "Health condition") +
    theme_bw() +
    theme(text = element_text(size = 20))
options(repr.plot.width=10, repr.plot.height=5)
g

Numeric-categorical relationships¶

  • Side-by-side boxplots of radish growth under three conditions: constant darkness, half light–half darkness, and constant light.

Side-by-side boxplots for radish growth data

Numeric-numeric relationships¶

  • scatterplot
In [16]:
library(MASS)

g <- ggplot(data = Cars93, aes(x = Weight, y = MPG.city)) +
    geom_point() +
    labs(title = "Scatterplot of Weight of Car vs City MPG",
             x = "Weight of car (in pounds)",
             y = "City miles per gallon")+
    theme_bw() +
    theme(text = element_text(size = 20))
options(repr.plot.width=8, repr.plot.height=4)
g

Measures of Dispersion¶

  • We have considered the shapes and centers of distributions, but a good description of a distribution should also characterize how spread out the distribution is: Are the observations in the sample all nearly equal, or do they differ substantially?
  • We defined the interquartile range (IQR) in the previous section, which is one measure of dispersion. Here we consider two other measures of dispersion: the range and the standard deviation.

The range¶

The sample range is the difference between the largest and smallest observations in a sample.

Recall the blood pressure data: The systolic blood pressures (mm Hg) of seven middle-aged men were as follows:

151 124 132 170 146 124 113

  • The range is easy to calculate, but it is very sensitive to extreme values; that is, it is not robust.
  • Unlike the range, the IQR is robust.
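In R:

In [ ]:
bp <- c(151, 124, 132, 170, 146, 124, 113)
range(bp)        # smallest and largest observations: 113 and 170
diff(range(bp))  # the sample range, 170 - 113 = 57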

The standard deviation¶

The standard deviation is the classical and most widely used measure of dispersion. The sample standard deviation is denoted by $s$ and is defined by the following formula: $$s=\sqrt{\frac{1}{n-1}\sum_{i=1}^n(y_i-\bar{y})^2}.$$ Here $y_i-\bar{y}$ is called the deviation between observation $y_i$ and the sample mean, and $\sum_{i=1}^n(y_i-\bar{y})^2$ denotes the sum of the squared deviations.

The sample variance, denoted by $s^2$, is simply the standard deviation squared: $$s^2=\text{variance}\quad\text{or}\quad s=\sqrt{\text{variance}}.$$

We will frequently abbreviate "standard deviation" as "SD"; the symbol "s" will be used in formulas.

Example: chrysanthemum growth¶

In an experiment on chrysanthemums, a botanist measured the stem elongation (mm in 7 days) of five plants grown on the same greenhouse bench. The results were as follows:

76 72 65 70 82

Observation Deviation Squared deviation
76 3 9
72 -1 1
65 -8 64
70 -3 9
82 9 81
Sum 0 164

The mean is $\bar{y}=365/5=73$ mm, so $s=\sqrt{164/(5-1)}=\sqrt{41}\approx6.4$ mm.
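Verifying the calculation in R:

In [ ]:
growth <- c(76, 72, 65, 70, 82)
mean(growth)                  # 73
devs <- growth - mean(growth) # 3 -1 -8 -3 9; these sum to zero
sum(devs^2)                   # 164, the sum of squared deviations
sqrt(sum(devs^2) / (5 - 1))   # about 6.4
sd(growth)                    # the same value
var(growth)                   # 41, the sample variance s^2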

Interpretation of the definition of $s$¶

If the chrysanthemum growth data are

75 72 73 75 70

then the mean is the same ($\bar{y} = 73$ mm), but the SD is smaller ($s = 2.1$ mm), because the observations lie closer to the mean.

Chrysanthemum growth

Why $n-1$?¶

Note that the sum of the deviations $y_i-\bar{y}$ is always zero. Thus, once the first $n-1$ deviations have been calculated, the last deviation is constrained. This means that in a sample with $n$ observations, there are only $n-1$ units of information concerning deviation from the average. The quantity $n-1$ is called the degrees of freedom of the standard deviation or variance.

Consider the extreme cases $n=1$ and $n=2$ with $y_1=y_2$: when $n=1$, the single deviation is always zero, so the sample carries no information about variability and the formula gives the indeterminate $0/0$; when $n=2$ with $y_1=y_2$, the one unit of information tells us there is no spread, and indeed $s=0$.
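R reflects this directly:

In [ ]:
sd(5)       # NA: a single observation has zero degrees of freedom
sd(c(5, 5)) # 0: one unit of information, and it says there is no spread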

The Empirical Rule¶

For "nicely shaped" distributions; that is, unimodal distributions that are not too skewed and whose tails are not overly long or short, we usually expect to find

  • about 68% of the observations within $\pm1$ SD of the mean.
  • about 95% of the observations within $\pm2$ SDs of the mean.
  • more than 99% of the observations within $\pm3$ SDs of the mean.
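For an exactly bell-shaped (normal) distribution, these percentages can be checked directly from the normal cumulative distribution function:

In [ ]:
# probability within 1, 2, and 3 SDs of the mean for a normal distribution
pnorm(1:3) - pnorm(-(1:3)) # about 0.683, 0.954, 0.997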
In [13]:
# Generate x values for the density plot
x <- seq(-4, 4, by = 0.01)

# Calculate the density values for the standard normal distribution
density <- dnorm(x)

# Create a data frame with x and density values
data <- data.frame(x = x, density = density)

# Create the plot using ggplot2
g <- ggplot(data, aes(x = x, y = density)) +
    geom_line() +
    labs(x = "x", y = "Density", title = "Standard Normal Distribution") +
    geom_vline(xintercept = c(-1, 1), col = "red", linetype = "dashed") +
    geom_vline(xintercept = c(-2, 2), col = "blue", linetype = "dashed") +
    geom_vline(xintercept = c(-3, 3), col = "green", linetype = "dashed") +
    scale_x_continuous(breaks = seq(-4, 4, by = 1)) +
    scale_y_continuous(limits = c(0, 0.45)) +
    theme_bw() +
    theme(text = element_text(size = 20))
In [14]:
options(repr.plot.width=10, repr.plot.height=5)
g

Robustness: IQR > SD > range (the IQR is the most robust of the three measures of dispersion; the range is the least robust).

In this course, we will rely primarily on the mean and SD rather than other descriptive measures.

Effect of Transformation of Variables¶

Transformation, or reexpression, of a variable $Y$ means replacing $Y$ by a new variable, say $Y'$. For example, we might convert from inches to centimeters or from °F to °C.

Linear transformations¶

For linear transformations, a graph of $Y$ against $Y'$ would be a straight line. A familiar reason for linear transformation is a change in the scale of measurement.

  • Multiplicative transformations: Suppose $Y$ represents the weight of an animal in kg, and we decide to reexpress the weight in lb. Then $$Y=\text{Weight in kg}$$ $$Y'=\text{Weight in lb}$$ so $$Y' = 2.2Y$$
  • Additive and multiplicative transformations: $$Y=\text{Temperature in °C}$$ $$Y'=\text{Temperature in °F}$$ then $$Y' = 1.8Y+32$$

A linear transformation consists of (1) multiplying all the observations by a constant, or (2) adding a constant to all the observations, or (3) both.

Under a linear transformation $Y'=aY+b$,

  • $\bar{y}'=a\bar{y}+b$
  • $\tilde{y}'=a\tilde{y}+b$
  • $s'=|a|s$
  • $\mathrm{IQR}'=|a|\mathrm{IQR}$
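A quick numerical check of these rules for the kg-to-lb example; the kg values below are made up for illustration.

In [ ]:
y <- c(3.1, 4.0, 2.7, 3.6)     # hypothetical weights in kg
yprime <- 2.2 * y              # the same weights reexpressed in lb
c(mean(yprime), 2.2 * mean(y)) # identical
c(sd(yprime), 2.2 * sd(y))     # identical, since a = 2.2 > 0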

Nonlinear transformations¶

Data are sometimes reexpressed in a nonlinear way. Examples of nonlinear transformations are

  • $Y'=\sqrt{Y}$
  • $Y'=\log Y$
  • $Y'=\frac{1}{Y}$
  • $Y'=Y^2$

  • The logarithmic transformation is especially common in biology because many important relationships can be simply expressed in terms of logs. For instance, there is a phase in the growth of a bacterial colony when log(colony size) increases at a constant rate with time.

  • If a distribution is skewed to the right, we may wish to apply a transformation that makes the distribution more symmetric, by pulling in the right-hand tail. Using $Y' = \sqrt{Y}$ will pull in the right-hand tail of a distribution and push out the left-hand tail. The transformation $Y'=\log Y$ is more severe than $\sqrt{Y}$ in this regard.

Distributions of singing time, square root of singing time, and log(singing time)
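A small simulated illustration of this effect; the gamma data below are hypothetical, chosen only to produce a right-skewed sample.

In [ ]:
set.seed(1)
y <- rgamma(1000, shape = 2)                    # simulated right-skewed data
c(mean = mean(y), median = median(y))           # mean > median: skewed to the right
c(mean = mean(log(y)), median = median(log(y))) # nearly equal: more symmetric after logs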

Statistical Inference¶

The process of drawing conclusions about a population, based on observations in a sample from that population, is called statistical inference.

Example: blood types¶

In an early study of the ABO blood-typing system, researchers determined blood types of 3,696 persons in England.

Blood type Frequency
A 1,634
B 327
AB 119
O 1,616
Total 3,696

These data were not collected for the purpose of learning about the blood types of those particular 3,696 people. Rather, they were collected for their scientific value as a source of information about the distribution of blood types in a larger population. For instance, one might presume that the blood type distribution of all English people should resemble the distribution for these 3,696 people. In particular, the observed relative frequency of type A blood was $$\frac{1634}{3696}\text{ or }44\%\text{ type A}$$ One might conclude from this that approximately 44% of the people in England have type A blood.
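The arithmetic in R:

In [ ]:
phat <- 1634 / 3696 # sample proportion of type A blood
round(phat, 2)      # 0.44, i.e., about 44%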

Schematic representation of inference from sample to population regarding prevalence of blood type A

In making a statistical inference, we hope that

  • the sample is representative of the population.
  • the sample size is not too small.

Describing a population¶

  • Just as each sample has a distribution, a mean, and an SD, so also we can envision a population distribution, a population mean, and a population SD.
  • In statistical language, we say that the sample characteristic is an estimate of the corresponding population characteristic.
  • A sample characteristic is called a statistic; a population characteristic is called a parameter.

Proportions¶

For a categorical variable, we can describe a population by simply stating the proportion, or relative frequency, of the population in each category. The sample proportion of a category is an estimate of the corresponding population proportion.

$$p=\text{ Population proportion}$$

$$\hat{p}=\text{ Sample proportion}$$

The symbol "^" can be interpreted as "estimate of". Thus, $$\hat{p}\text{ is an estimate of }p$$

Mean and SD¶

If the observed variable is quantitative, one can consider descriptive measures such as the mean, the SD, the median, the quartiles and so on. Each of these quantities can be computed for a sample of data, and each is an estimate of its corresponding population analog.

The population mean is denoted by $\mu$ (mu), and the population SD is denoted by $\sigma$ (sigma). We may define these as follows for a quantitative variable $Y$: $$\mu=\text{ Population average value of }Y$$ $$\sigma = \sqrt{\text{Population average value of }(Y-\mu)^2}$$

Measure Sample value (statistic) Population value (parameter)
Proportion $\hat{p}$ $p$
Mean $\bar{y}$ $\mu$
Standard deviation $s$ $\sigma$