source("http://macosa.dima.unige.it/r.R")    # If I have not already loaded the library
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# How to assess the discrepancy between the results of n tests, classified in nc
#  classes, and a certain distribution law? An idea is to consider:
# χ² = Σ i = 1 … nc (ObsFri - ExpFri)²/ ExpFri  (Observed and Expected Frequencies)
# Consider, for example, 50 throws of a dice game; I got 9 one, 11 two, 5 three, 8 four,
# 10 five and 7 six. In order to evaluate the discordance from the uniform distribution
# (corresponding to a balanced die), we calculate its χ².

# chi-square for the 50 die rolls: compare the observed class frequencies with
# those expected under the uniform (fair-die) law.
ObsFr = c(9, 11, 5, 8, 10, 7)      # observed counts for faces 1..6
Prob = rep(1/6, 6)                 # fair-die probabilities
n = sum(ObsFr)                     # number of throws (50)
ExpFr = Prob * n                   # expected counts
chi2 = sum((ObsFr - ExpFr)^2 / ExpFr)
chi2
# 2.8
# To evaluate the equity of the die we compare this value with the theoretical distri-
# bution of χ² corresponding to the balanced die:

# Empirical distribution of chi-square for a fair die: repeat the 50-roll
# experiment 1e4 times and keep each chi-square value in x.
# RUNIF comes from the macosa library; it presumably returns random integers
# in 1..6 here — TODO confirm against r.R.
Prob = rep(1/6,6); ExpFr = 50*Prob
x = numeric(1e4)
for (n in 1:1e4) {
  y = rep(0, 6)                                  # class counts for this run
  for (k in 1:50) { u = RUNIF(1, 1,6); y[u] = y[u]+1 }
  # I got random 50 outputs and I classified them
  x[n] = sum((y-ExpFr)^2/ExpFr)
}
summary(x)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  0.160   2.800   4.240   4.979   6.640  27.280
# I choose the scale of the chart:
# Choose the scale of the chart: h is the tallest density bar of the
# (not plotted) histogram of the simulated chi-square values.
# Fixes: the original had "\$density" (an escaped dollar sign, invalid R
# syntax) instead of "$density", and the partially matched argument "n=" of
# hist() is spelled out as "nclass=".
h=max(hist(x,probability=TRUE,right=FALSE,col="grey",nclass=30,plot=FALSE)$density );h
# 0.1706
BF=3.5; HF=3               # chart size (macosa-library globals)
Plane(0,28, 0,0.18)        # or:  Plane(0, max(x)*1.05, 0, h*1.05)
POINT(2.8,0,"blue")     # below on the left the hist. of theoretical χ² and our value
# I see that 2.8 is an almost central value (it corresponds to the 1st quartile). So I
# can consider plausible (ie do not refuse) the hypothesis that the die is balanced.

# Usually a procedure is used that does not depend on the particular phenomenon
# considered: if the number of tests is large every theoretical distribution of χ²
# is approximated by a distribution χ²(r) dependent only on degrees of freedom r, that
# is the amount of experimental frequencies that I have to know directly.
# In R the density of χ²(r) is x -> dchisq(x,r).

# In the previous case the degrees of freedom are 5: if I know 5 relative frequencies,
# I can find the 6th as the difference between 1 and the sum of them.

# Density of the χ²(5) distribution; graph (macosa library) plots it.
f = function(x) dchisq(x,5); graph(f, 0,28, "brown")
# I get the chart above to the right.

# The point which is the maximum of f:
# (maxmin and last are macosa-library helpers: maxmin locates the extremum of
#  f on the interval, last() recalls the last value computed)
maxmin(f, 0,10); f(last())
#    3          0.1541803

# qchisq(x,r) and pchisq(x,r) give the Kth (x=K/100) 100-quantile (or percentile) and
# the probability that χ² < x:

# graph2F, POINT, last, integral and diseq are macosa-library helpers.
graph2F(f, 0,28, "red"); POINT(3,0.1541803, "blue")
qchisq(0.75, 5); Q = last()        # Q = 3rd quartile of χ²(5)
# 6.62568
integral(f, 0,Q)                   # area under the density up to Q
# 0.75
pchisq(Q, 5)
# 0.75
# NOTE(review): z is not defined until later in the file; diseq presumably
# takes its first argument unevaluated — confirm against r.R.
POINT(Q,f(Q),"seagreen"); diseq(z,f, 0,Q, "brown")
# Here are the values for different degrees of freedom:
perc = c(5,10,25,50,75,90,95); tit = c("d.f.",perc); df = c(1,2,3,4,5,10,20,50,100)
# Print a tab-separated table: one header row, then one row per d.f. value.
{ write(tit,file="",sep="\t",ncolumns=8);
for (g in df) write( c(g,signif(qchisq(perc/100,g),3)),file="",sep="\t",ncolumns=8) }
# d.f.    5       10      25      50      75      90      95
# 1       0.00393 0.0158  0.102   0.455   1.32    2.71    3.84
# 2       0.103   0.211   0.575   1.39    2.77    4.61    5.99
# 3       0.352   0.584   1.21    2.37    4.11    6.25    7.81
# 4       0.711   1.06    1.92    3.36    5.39    7.78    9.49
# 5       1.15    1.61    2.67    4.35    6.63    9.24    11.1
# 10      3.94    4.87    6.74    9.34    12.5    16      18.3
# 20      10.9    12.4    15.5    19.3    23.8    28.4    31.4
# 50      34.8    37.7    42.9    49.3    56.3    63.2    67.5
# 100     77.9    82.4    90.1    99.3    109     118     124
#    6.63  is the value of  qchisq(0.75, 5)

# Returning to our die: if, instead of 2.8, I had obtained 13 as χ², I would have grave
# doubts about the fact that the die is balanced: the 95th percentile is 11.1:
# 95th percentile of χ²(5):
qchisq(0.95, 5)
# 11.0705
# 13 corresponds to the 98th percentile:
pchisq(13, 5)
# 0.9766212
# But also if I had got 0.4 I would have grave doubts, this time about whether
# the test results are genuine: the 5th percentile is 1.1:
qchisq(0.05,5)
# 1.145476
# For example if I got 9 one, 8 two, 8 three, 7 four, 9 five and 9 six, χ² is 0.4:
# NOTE(review): the original computed ExpFr = Prob*n, but n had been clobbered
# by the simulation loop above (it ends with n == 1e4, not 50), so the expected
# frequencies were wrong. Recompute n from the observed counts and restate Prob
# locally so the example is self-contained.
ObsFr = c(9,8,8,7,9,9); Prob = rep(1/6,6); n = sum(ObsFr)
ExpFr = Prob*n; sum( (ObsFr-ExpFr)^2/ExpFr )
# 0.4
# It is useful (and convenient) to compare these values (2.8 and 0.4) with the table:
p = c( 5, 10, 25, 50, 75, 90, 95 )/100;  r = 5; print(qchisq(p,r), 3)
#   1.15  1.61  2.67  4.35  6.63  9.24  11.07
#   1.15  1.61  2.67  4.35  6.63  9.24  11.07          (see also the previous table)
#
# Another example.
# An investigation of the weight of males between 45 and 55 years of a certain nation:
# the sample is 825 males, in 11 classes.
# Observed frequencies in the 11 weight classes; l = number of classes,
# n = sample size.
freq = c(4,39,122,197,192,126,93,33,16,2,1); l=length(freq); n=sum(freq); l; n
#  11   825
# Class boundaries (width 7) and class midpoints:
interv = seq(53.5, 130.5, 7)
val = seq(57, 127, 7)
# histoclas (macosa library) draws the histogram of the classified data;
# noClass is presumably a library flag — TODO confirm its meaning in r.R.
noClass=1; histoclas(interv, freq)
# We get the chart above. To have an histogram of area 1 I use "dataclas" (see):
# (dataclas is presumably the data vector rebuilt from the classes by
#  histoclas — TODO confirm against the library)
hist(dataclas,interv,col="grey",main="",cex.axis=0.8,probability=TRUE)
BOX()
# We get the chart below.
# Is it acceptable that the distribution be gaussian with the same mean and
# standard deviation?
# NOTE(review): in the original, the next statement was fused into the comment
# line above, so m and s were never assigned and z below could not work; the
# code is restored on its own line.
m = mean(dataclas); s = sd(dataclas); m; s
#   84.29633            11.68259
# Gaussian density with the sample mean and sd, drawn over the histogram:
z <- function(x) dnorm(x,mean=m,sd=s); graph(z, 50,140, "brown")

# It seems that the curve fits the histogram, but ... the data is numerous ...

# How many degrees of freedom are there in this case?
# 11 classes; the imposed connections are 3: the number of classes, the mean and the
# standard deviation; the degrees of freedom are 11-3 = 8.
# The probabilities are calculated using integrals:
# Probability of each of the 11 classes under the fitted gaussian: integrate
# its density z over the class interval [53.5+7i, 53.5+7(i+1)).
# (integral is the macosa-library numerical integrator.)
K <- function(i) integral(z, 53.5 + i*7, 53.5 + (i+1)*7)
Prob <- sapply(0:10, K)            # same as c(K(0), K(1), ..., K(10))
ExpFr = Prob*n
ObsFr = freq
sum( (ObsFr-ExpFr)^2/ExpFr )
#  27.07139
# We compare this value with the percentiles of χ²(8):
p = c(5, 10, 25, 50, 75, 90, 95) / 100
r = 8
print(qchisq(p, r), 3)
#  2.73  3.49  5.07  7.34  10.22  13.36  15.51
# 27.07 is far beyond the 95th percentile (15.51): clearly we must discard the
# hypothesis that the distribution is Gaussian!
#
# A further example:
# I toss a coin 100 times and get 65 heads. Can the coin be considered fair?
# There is 1 degree of freedom.
p = c(5, 10, 25, 50, 75, 90, 95) / 100
r = 1
print(qchisq(p, r), 3)
# 0.00393 0.01579 0.10153 0.45494 1.32330 2.70554 3.84146
# chi-square of the observed heads/tails counts against the fair-coin 50/50:
ObsFr = c(65, 35)
ExpFr = c(50, 50)
sum((ObsFr - ExpFr)^2 / ExpFr)
# 9
# 9 is much larger than the 95th percentile (3.84146); I can not consider the
# coin balanced. If I want the percentile:
pchisq(9, 1)
# 0.9973002
#
# One last example:
# to study the distribution of the error of the measurements detected by means of a
# radio altimeter, 400 measurements are performed, obtaining the following results:
# interval   [20,30) [30,40) [40,50) [50,60) [60,70) [70,80) [80,90) [90,100)
# frequency     21      72      66      38      51      56      64      32
# Class boundaries and observed frequencies of the 400 altimeter measurements:
interv = c(20,30,40,50,60,70,80,90,100)
freq =     c(21,72,66,38,51,56,64,32);  n = 400
BF=3.5; HF=2.5                         # chart size (macosa-library globals)
histoclas(interv,freq)                 # See here for the histograms of classified data
# The mean (brown dot) is about  60.24998.
# We verify (by the test χ²) the conformity of the uniform distribution with the
# experimental distribution. We do not know the interval of uniform distribution. Let
# it be [a,b). We can calculate the mean and variance of the data, and express them as
# a function of a and b (and then derive the values of a and b).
# (Var and dataclas come from the macosa library; dataclas is presumably the
#  data reconstructed from the classes by histoclas — TODO confirm.)
M = mean(dataclas); V = Var(dataclas); M; V
#     60.24998            457.2783
# In the case of uniform distribution, M = (a+b)/2, V = (b-a)^2/12, or:
# a + b = 2*M,  b - a = sqrt(V*12).
b = (sqrt(V*12)+2*M)/2; a = (2*M-sqrt(V*12))/2; a; b
#   23.21169                97.28827
# Mark the fitted interval [a,b] on the chart:
line(a,1, b,1, "blue")
# Probabilities of the 8 classes under the fitted uniform distribution on
# [a,b): the first and last classes are truncated by a and b respectively,
# the six middle classes each have width 10.
# NOTE(review): the original assigned the scalar pr twice (pr = (30-a)/L was
# immediately overwritten by pr = (b-90)/L) and never set pr[8], so pr had
# length 7 and the first class probability was wrong. Assign pr[1] and pr[8]
# explicitly; the values below (0.0916..., 0.0984...) confirm the intent.
L = b-a; pr = numeric(8)
pr[1] = (30-a)/L; pr[8] = (b-90)/L
for(i in 2:7) pr[i] = 10/L
pr
#  0.09163906 0.13499544 0.13499544 0.13499544 0.13499544 0.13499544 0.13499544 0.09838829
ExpFr = n*pr; chi2 = sum((freq-ExpFr)^2/(ExpFr)); chi2
# 23.56327
# The degrees of freedom are 8-3 = 5 (I take 3 instead of 1 because I have also imposed
# mean and variance, or a and b).
# Percentiles of χ²(5) for comparison with the value 23.6 computed above:
p = c( 5, 10, 25, 50, 75, 90, 95 )/100;  r = 5; print(qchisq(p,r), 3)
#  1.15  1.61  2.67  4.35  6.63  9.24 11.07
# 23.6 is well beyond the 95th percentile. I therefore reject the hypothesis of
# uniformity.

# Now, we consider problems in which each member of a population can be classified
# according to two distinct characteristics. An example.
# Consider the following data (processed by an article of Spencer Russell, 1926);
# frequencies of sunset colors and whether each was followed by rain are presented:
#  Sky color  Number of observations  Number followed by rain
#  Red                     61               26
#  Mainly red             198               53
#  Yellow                 159               81
#  Mainly yellow          188               86
#  Red and yellow         190               51
#  Gray                   302              167
# I want to value the hypothesis that whether it rains tomorrow is independent of the
# color of today's sunset. There are various tests to do this, but they are not easy to
# interpret. A graphical representation is enough:
# Observations per sky color and how many of them were followed by rain:
days = c(61,198,159,188,190,302)
rain = c(26, 53, 81, 86, 51,167)
BF=4.5; HF=3.5; Plane(0,320, 0, 180); POINT(days,rain,"brown")
abovex("observations"); abovey("days followed by rain")
# LR0 (macosa library) fits the regression line through the origin; it
# presumably stores the coefficients in the globals LR01 and LR02 — TODO
# confirm against r.R.
LR0(days,rain,0,0)
# 0   0.4385932  <- LR01  LR02   y=LR01+LR02*x
f = function(x) LR01+LR02*x
graph1(f,0,330, "grey")
# An examination of the graph shows that there is no dependence between the two phenomena.
# All data pairs, in a way that is practically independent of the sky conditions the night
# before, are arranged more or less along a straight line passing through the origin. The
# proverb "red sky at night, good weather in sight" has no foundation.

# There are many other tests. You can explore them with this command (then you must
# click "Search Engine & Keywords" and type "test" in the text field)
help.start()   # opens the local HTML help index in the browser

# Here you can see some examples of using Student's t-test and other tests.