How to approximate the average?
               How many classes do you take to build a histogram?

                        How to approximate the average?

source("http://macosa.dima.unige.it/r.R")    # If I have not already loaded the library
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
If all the data are approximated to the same digit (units, tenths, ...), their average
can be given to the next digit (tenths, hundredths, ...) if there are at least ten data,
to the second next digit (hundredths, thousandths, ...) if there are at least a
thousand, to the third next digit (thousandths, ten-thousandths, ...) if there are at
least a hundred thousand, and so on [if the original data were truncated rather than
rounded, also add 1/2 unit of their last digit to the average].
An example:
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# Lengths, in cm, of many broad beans (i.e. bean seeds) collected by a class of
# 12-year-olds. The seeds were measured from photographs taken on graph paper.
# NOTE(review): the second half of this vector (from the 1.35 in the middle of
# the ninth data row) appears to repeat the first half value for value --
# confirm that the duplication is intended (e.g. two identical batches).

beans = c(
1.35,1.65,1.80,1.40,1.65,1.80,1.40,1.65,1.85,1.40,1.65,1.85,1.50,1.65,1.90,
1.50,1.65,1.90,1.50,1.65,1.90,1.50,1.70,1.90,1.50,1.70,1.90,1.50,1.70,2.25,
1.55,1.70,1.55,1.70,1.55,1.70,1.60,1.70,1.60,1.75,1.60,1.75,1.60,1.80,1.60,
1.80,1.60,1.80,1.60,1.80,1.00,1.55,1.70,1.75,1.30,1.55,1.70,1.75,1.40,1.60,
1.70,1.75,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.40,
1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,
1.45,1.60,1.70,1.80,1.50,1.60,1.70,1.80,1.50,1.60,1.70,1.85,1.50,1.60,1.70,
1.85,1.50,1.60,1.75,1.90,1.50,1.60,1.75,1.90,1.50,1.65,1.75,1.90,1.55,1.65,
1.75,1.95,1.55,1.65,1.75,2.00,1.55,1.65,1.75,2.30,1.35,1.65,1.80,1.40,1.65,
1.80,1.40,1.65,1.85,1.40,1.65,1.85,1.50,1.65,1.90,1.50,1.65,1.90,1.50,1.65,
1.90,1.50,1.70,1.90,1.50,1.70,1.90,1.50,1.70,2.25,1.55,1.70,1.55,1.70,1.55,
1.70,1.60,1.70,1.60,1.75,1.60,1.75,1.60,1.80,1.60,1.80,1.60,1.80,1.60,1.80,
1.00,1.55,1.70,1.75,1.30,1.55,1.70,1.75,1.40,1.60,1.70,1.75,1.40,1.60,1.70,
1.80,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.40,1.60,
1.70,1.80,1.40,1.60,1.70,1.80,1.40,1.60,1.70,1.80,1.45,1.60,1.70,1.80,1.50,
1.60,1.70,1.80,1.50,1.60,1.70,1.85,1.50,1.60,1.70,1.85,1.50,1.60,1.75,1.90,
1.50,1.60,1.75,1.90,1.50,1.65,1.75,1.90,1.55,1.65,1.75,1.95,1.55,1.65,1.75,
2.00,1.55,1.65,1.75,2.30
)
# Number of bean measurements and their raw (unrounded) mean.
n <- length(beans)
m <- mean(beans)
n
# 260
m
# 1.659231
# How to approximate the mean?
# The number of data is between 10 and 1000. I can take between 2 and 3 digits:
round(m, digits = 2)
# 1.66
# I take 3 digits, keeping in mind that the last digit carries some uncertainty.
#
# Lengths, in mm, of many basil seeds collected by a class of 12-year-olds.
# The seeds were measured from images taken with a camcorder connected to a
# stereo microscope.

basil <- c(
1.996646,2.427837,2.002445,2.032486,2.440977,2.179811,1.827547,2.122749,2.273763,
2.237457,2.234695,2.416860,1.855254,2.141668,2.274085,2.148191,2.188731,2.279401,
1.861674,2.148191,2.277117,1.907743,2.151697,2.149251,1.874470,2.149251,2.279401,
1.885252,2.309115,2.479710,1.883268,2.151697,2.302933,1.979976,2.353246,2.231072,
1.885252,2.176491,2.309115,1.861674,2.274085,2.336312,1.891458,2.178452,2.335834,
2.072091,2.302933,2.196575,1.907743,2.179811,2.336312,2.141668,2.273763,2.194292,
1.943342,2.181266,2.339914,2.348716,2.574592,1.967000,2.188731,2.348716,2.208185,
2.277117,1.975734,2.194292,2.353246,1.943342,2.238444,1.979976,2.196575,2.395220,
2.098704,2.482356,1.996646,2.204940,2.406590,2.204940,2.458355,2.002445,2.205823,
2.416860,1.883268,2.667822,2.015793,2.208185,2.427837,2.015793,2.457101,2.016699,
2.224770,2.440977,1.855254,2.395220,2.032486,2.226911,2.457101,2.052005,2.176491,
2.033379,2.231072,2.458355,2.104753,2.178452,2.045551,2.232692,2.459751,2.335834,
2.339914,2.052005,2.234695,2.479710,2.122749,2.033379,2.069424,2.237457,2.482356,
1.967000,1.975734,2.072091,2.238444,2.574592,2.267303,2.205823,2.098704,2.267303,
2.667822,2.232692,2.226911,2.104753,1.891458,2.406590,2.045551,1.827547,2.069424,
2.459751,1.874470,2.181266,2.224770,2.016699,2.602342,1.980298,2.414356,2.156164,
1.944474,2.176403,2.381037,2.665530,2.282354,1.971069,2.178466,2.389039,2.403857,
2.176403,1.980298,2.222064,2.400560,2.441692,2.256341,2.005266,2.233202,2.403857,
2.400560,2.301457,2.047079,2.256341,2.414356,2.279626,2.222064,2.063478,2.257275,
2.441692,2.389039,2.293673,2.073464,2.264273,2.441809,2.663396,2.063478,2.075890,
2.279626,2.501192,2.575395,2.264273,2.080611,2.282354,2.524316,1.944474,2.112226,
2.097085,2.288252,2.575395,2.047079,2.178466,2.112226,2.293673,2.602342,2.073464,
2.299742,2.141965,2.299742,2.654032,2.142679,2.305893,2.142679,2.301457,2.663396,
2.075890,2.144709,2.144709,2.305893,2.665530,2.080611,2.288252,2.156164,2.345941,
2.739415,2.163349,2.257275,2.161602,2.364831,2.762354,2.141965,2.161602,2.163349,
2.368598,2.005266,2.097085,1.971069,2.381037,2.345941,2.233202,2.739415,2.524316,
2.762354,2.364831,2.501192,2.654032,2.368598,2.441809
)
# Number of basil measurements and their raw (unrounded) mean.
length(basil)
# 240
mean(basil)
# 2.230432
# The number of data is between 10 and 1000. I can take between 2 and 3 digits:
round(mean(basil), digits = 2)
# 2.23
# I take 3 digits, keeping in mind that the last digit carries some uncertainty.
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------

               How many classes do you take to build a histogram?

The shape of the histogram used to represent the distribution of a set of data depends
on the choice of the number of classes. You can try different choices and keep the
histogram that looks best. There is no strict criterion, but as a first attempt you can
take a number of classes approximately equal to the square root of the number of data.
The reason: if the distribution were uniform and the number of classes were equal to the
number of data, each column would contain one cross; if instead we take
(Number of Classes) ≈ √(Number of Data), each column contains about
(Number of Data)/√(Number of Data) = √(Number of Data) crosses, i.e. about as many
crosses as there are classes.

                  
                                  16                      √16
If you don't choose the number of classes, R makes its own choice.
The case of beans (considered above).
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# Histograms of the bean lengths, drawn with the helpers from the sourced
# library. BF, HF and noClass are globals read by that library
# (presumably figure width/height and a switch for the class listing --
# TODO confirm against the library).

# The left figure: histogram with the classes chosen automatically.
BF <- 4
HF <- 2.5
histo(beans)

# The right figure: same histogram with an axis caption, a vertical grid
# and labels on the vertical axis.
noClass <- 1
histo(beans)
underx2("lengths of beans")
gridVC(seq(0, 3, by = 0.1), "black")
GridVC(seq(0, 3, by = 0.5), "black")
underY("20", 20 / 0.1)
underY("10", 10 / 0.1)
underY("0", 0 / 0.1)

# How to choose the number of classes?
sqrt(length(beans))
# 16.12452
# The program had automatically chosen 13 classes. I can do this instead
# (classes from 0.5 to 2.5 in steps of 0.1):
noClass <- 1
Histo(beans, 0.5, 2.5, 0.1)
gridVC(seq(0, 3, by = 0.1), "black")
GridVC(seq(0, 3, by = 0.5), "black")
underY("20", 20 / 0.1)
underY("10", 10 / 0.1)
underY("0", 0 / 0.1)
                  
# Then we can check that this kind of data has a Gaussian distribution by
# overlaying the normal density built from the sample mean and the spread
# returned by Sd() (a helper from the sourced library).
m <- mean(beans)
s <- Sd(beans)
# z looks up m and s when it is called, so it follows their current values.
z <- function(x) {
  dnorm(x, mean = m, sd = s)
}
# Multiplied by 100 because the histogram shows the density in percentage
# form.
z1 <- function(x) {
  z(x) * 100
}
graph2(z1, 0, 4, "brown")
                  
# If you want the density (not in percentage form), draw the histogram with
# base hist() and probability = TRUE in a new graphics window.
dev.new()
# breaks = 16 (about sqrt(length(beans))) replaces the former `n=16`, which
# only worked through partial matching of hist()'s `nclass` argument. For a
# single number the two are equivalent; hist() treats it as a suggestion.
hist(beans, breaks = 16, right = FALSE, probability = TRUE, cex.axis = 0.8,
     col = "grey", main = "")
# z is the Gaussian density function defined earlier in the script.
graph2(z, 0.5, 2.5, "brown")
# Dotted horizontal guide lines at the tick marks of the vertical axis.
abline(h = axTicks(2), lty = 3)
                
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
The case of basil (considered above).
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# Histograms of the basil seed lengths, drawn with the helpers from the
# sourced library (noClass is a global read by that library).

# First attempt: classes chosen automatically.
noClass <- 1
histo(basil)
gridVC(seq(0, 3, by = 0.1), "black")
GridVC(seq(0, 3, by = 0.5), "black")
underY("20", 20 / 0.1)
underY("10", 10 / 0.1)
underY("0", 0 / 0.1)

# The program had automatically chosen 10 classes. I can do this instead
# (classes from 1.5 to 3 in steps of 0.1):
noClass <- 1
Histo(basil, 1.5, 3, 0.1)
underx2("lengths of basil seeds")
gridVC(seq(0, 3, by = 0.1), "black")
GridVC(seq(0, 3, by = 0.5), "black")
underY("20", 20 / 0.1)
underY("10", 10 / 0.1)
underY("0", 0 / 0.1)
                   
# Then we can check that this kind of data has a Gaussian distribution by
# overlaying the normal density built from the sample mean and the spread
# returned by Sd() (a helper from the sourced library).
m <- mean(basil)
s <- Sd(basil)
# z looks up m and s when it is called, so it follows their current values.
z <- function(x) {
  dnorm(x, mean = m, sd = s)
}
# Multiplied by 100 because the histogram shows the density in percentage
# form.
z1 <- function(x) {
  z(x) * 100
}
graph2(z1, 0, 4, "brown")
#
# If you want the density (not in percentage form), draw the histogram with
# base hist() and probability = TRUE.
# To choose the number of classes I can do:
sqrt(length(basil))
# 15.49193
dev.new()
# breaks = 15 replaces the former `n=15`, which only worked through partial
# matching of hist()'s `nclass` argument. For a single number the two are
# equivalent; hist() treats it as a suggestion.
hist(basil, breaks = 15, right = FALSE, probability = TRUE, cex.axis = 0.8,
     col = "grey", main = "")
# z is the Gaussian density function defined earlier in the script.
graph2(z, 1.5, 3, "brown")
# Dotted horizontal guide lines at the tick marks of the vertical axis.
abline(h = axTicks(2), lty = 3)

# Alternative: set up the plane with the sourced library's Plane() and add
# the histogram to it (add = TRUE) instead of opening a new plot.
Plane(1.5, 3, 0, 2.5)
hist(basil, breaks = 15, right = FALSE, probability = TRUE, cex.axis = 0.8,
     col = "grey", main = "", add = TRUE)
graph2(z, 1.5, 3, "brown")
abovex("basil seeds")
abovey("density")

                   

Other examples of use