source("http://macosa.dima.unige.it/r.R")    # If I have not already loaded the library
---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# How to assess the discrepancy between the results of n tests, classified in nc
#  classes, and a certain distribution law? An idea is to consider:
# χ² = Σ i = 1 … nc (ObsFri - ExpFri)²/ ExpFri  (Observed and Expected Frequencies)
# Consider, for example, 50 throws of a dice game; I got 9 one, 11 two, 5 three, 8 four,
# 10 five and 7 six. In order to evaluate the discordance from the uniform distribution
# (corresponding to a balanced die), we calculate its χ².

# chi-square for the 50 die rolls: compare the observed class frequencies with
# those expected under the uniform (fair-die) law.
ObsFr = c(9, 11, 5, 8, 10, 7)      # observed counts for faces 1..6
Prob = rep(1/6, 6)                 # fair-die probabilities
n = sum(ObsFr)                     # number of throws (50)
ExpFr = Prob * n                   # expected counts
chi2 = sum((ObsFr - ExpFr)^2 / ExpFr)
chi2
# 2.8
# To evaluate the equity of the die we compare this value with the theoretical distri-
# bution of χ² corresponding to the balanced die:

# Empirical distribution of chi-square for a fair die: repeat the 50-roll
# experiment 1e4 times and keep each chi-square value in x.
# RUNIF comes from the macosa library; it presumably returns random integers
# in 1..6 here — TODO confirm against r.R.
Prob = rep(1/6,6); ExpFr = 50*Prob
x = numeric(1e4)
for (n in 1:1e4) {
  y = rep(0, 6)                                  # class counts for this run
  for (k in 1:50) { u = RUNIF(1, 1,6); y[u] = y[u]+1 }
  # I got random 50 outputs and I classified them
  x[n] = sum((y-ExpFr)^2/ExpFr)
}
summary(x)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  0.160   2.800   4.240   4.979   6.640  27.280
# I choose the scale of the chart:
# Choose the scale of the chart: h is the tallest density bar of the
# (not plotted) histogram of the simulated chi-square values.
# Fixes: the original had "\$density" (an escaped dollar sign, invalid R
# syntax) instead of "$density", and the partially matched argument "n=" of
# hist() is spelled out as "nclass=".
h=max(hist(x,probability=TRUE,right=FALSE,col="grey",nclass=30,plot=FALSE)$density );h
# 0.1706
BF=3.5; HF=3               # chart size (macosa-library globals)
Plane(0,28, 0,0.18)        # or:  Plane(0, max(x)*1.05, 0, h*1.05)
POINT(2.8,0,"blue")     # below on the left the hist. of theoretical χ² and our value
# I see that 2.8 is an almost central value (it corresponds to the 1st quartile). So I
# can consider plausible (ie do not refuse) the hypothesis that the die is balanced.

# Usually a procedure is used that does not depend on the particular phenomenon
# considered: if the number of tests is large every theoretical distribution of χ²
# is approximated by a distribution χ²(r) dependent only on degrees of freedom r, that
# is the amount of experimental frequencies that I have to know directly.
# In R the density of χ²(r) is x -> dchisq(x,r).

# In the previous case the degrees of freedom are 5: if I know 5 relative frequencies,
# I can find the 6th as the difference between 1 and the sum of them.

# Density of the χ²(5) distribution; graph (macosa library) plots it.
f = function(x) dchisq(x,5); graph(f, 0,28, "brown")
# I get the chart above to the right.

# The point which is the maximum of f:
# (maxmin and last are macosa-library helpers: maxmin locates the extremum of
#  f on the interval, last() recalls the last value computed)
maxmin(f, 0,10); f(last())
#    3          0.1541803

# qchisq(x,r) and pchisq(x,r) give the Kth (x=K/100) 100-quantile (or percentile) and
# the probability that χ² < x:

# graph2F, POINT, last, integral and diseq are macosa-library helpers.
graph2F(f, 0,28, "red"); POINT(3,0.1541803, "blue")
qchisq(0.75, 5); Q = last()        # Q = 3rd quartile of χ²(5)
# 6.62568
integral(f, 0,Q)                   # area under the density up to Q
# 0.75
pchisq(Q, 5)
# 0.75
# NOTE(review): z is not defined until later in the file; diseq presumably
# takes its first argument unevaluated — confirm against r.R.
POINT(Q,f(Q),"seagreen"); diseq(z,f, 0,Q, "brown")
# Here are the values for different degrees of freedom:
perc = c(5,10,25,50,75,90,95); tit = c("d.f.",perc); df = c(1,2,3,4,5,10,20,50,100)
# Print a tab-separated table: one header row, then one row per d.f. value.
{ write(tit,file="",sep="\t",ncolumns=8);
for (g in df) write( c(g,signif(qchisq(perc/100,g),3)),file="",sep="\t",ncolumns=8) }
# d.f.    5       10      25      50      75      90      95
# 1       0.00393 0.0158  0.102   0.455   1.32    2.71    3.84
# 2       0.103   0.211   0.575   1.39    2.77    4.61    5.99
# 3       0.352   0.584   1.21    2.37    4.11    6.25    7.81
# 4       0.711   1.06    1.92    3.36    5.39    7.78    9.49
# 5       1.15    1.61    2.67    4.35    6.63    9.24    11.1
# 10      3.94    4.87    6.74    9.34    12.5    16      18.3
# 20      10.9    12.4    15.5    19.3    23.8    28.4    31.4
# 50      34.8    37.7    42.9    49.3    56.3    63.2    67.5
# 100     77.9    82.4    90.1    99.3    109     118     124
#    6.63  is the value of  qchisq(0.75, 5)

# Returning to our die: if, instead of 2.8, I had obtained 13 as χ², I would have grave
# doubts about the fact that the die is balanced: the 95th percentile is 11.1:
# 95th percentile of χ²(5):
qchisq(0.95, 5)
# 11.0705
# 13 corresponds to the 98th percentile:
pchisq(13, 5)
# 0.9766212
# But also if I had got 0.4 I would have grave doubts, this time about whether
# the test results are genuine: the 5th percentile is 1.1:
qchisq(0.05,5)
# 1.145476
# For example if I got 9 one, 8 two, 8 three, 7 four, 9 five and 9 six, χ² is 0.4:
# NOTE(review): the original computed ExpFr = Prob*n, but n had been clobbered
# by the simulation loop above (it ends with n == 1e4, not 50), so the expected
# frequencies were wrong. Recompute n from the observed counts and restate Prob
# locally so the example is self-contained.
ObsFr = c(9,8,8,7,9,9); Prob = rep(1/6,6); n = sum(ObsFr)
ExpFr = Prob*n; sum( (ObsFr-ExpFr)^2/ExpFr )
# 0.4
# It is useful (and convenient) to compare these values (2.8 and 0.4) with the table:
p = c( 5, 10, 25, 50, 75, 90, 95 )/100;  r = 5; print(qchisq(p,r), 3)
#   1.15  1.61  2.67  4.35  6.63  9.24  11.07
#   1.15  1.61  2.67  4.35  6.63  9.24  11.07          (see also the previous table)
#
# Another example.
# An investigation of the weight of males between 45 and 55 years of a certain nation:
# the sample is 825 males, in 11 classes.
# Observed frequencies in the 11 weight classes; l = number of classes,
# n = sample size.
freq = c(4,39,122,197,192,126,93,33,16,2,1); l=length(freq); n=sum(freq); l; n
#  11   825
# Class boundaries (width 7) and class midpoints:
interv = seq(53.5, 130.5, 7)
val = seq(57, 127, 7)
# histoclas (macosa library) draws the histogram of the classified data;
# noClass is presumably a library flag — TODO confirm its meaning in r.R.
noClass=1; histoclas(interv, freq)
# We get the chart above. To have an histogram of area 1 I use "dataclas" (see):
# (dataclas is presumably the data vector rebuilt from the classes by
#  histoclas — TODO confirm against the library)
hist(dataclas,interv,col="grey",main="",cex.axis=0.8,probability=TRUE)
BOX()
# We get the chart below.
# Is it acceptable that the distribution be gaussian with the same mean and
# standard deviation?
# NOTE(review): in the original, the next statement was fused into the comment
# line above, so m and s were never assigned and z below could not work; the
# code is restored on its own line.
m = mean(dataclas); s = sd(dataclas); m; s
#   84.29633            11.68259
# Gaussian density with the sample mean and sd, drawn over the histogram:
z <- function(x) dnorm(x,mean=m,sd=s); graph(z, 50,140, "brown")

# It seems that the curve fits the histogram, but ... the data is numerous ...

# How many degrees of freedom are there in this case?
# 11 classes; the imposed connections are 3: the number of classes, the mean and the
# standard deviation; the degrees of freedom are 11-3 = 8.
# The probabilities are calculated using integrals:
# Probability of each of the 11 classes under the fitted gaussian: integrate
# its density z over the class interval [53.5+7i, 53.5+7(i+1)).
# (integral is the macosa-library numerical integrator.)
K <- function(i) integral(z, 53.5 + i*7, 53.5 + (i+1)*7)
Prob <- sapply(0:10, K)            # same as c(K(0), K(1), ..., K(10))
ExpFr = Prob*n
ObsFr = freq
sum( (ObsFr-ExpFr)^2/ExpFr )
#  27.07139
# We compare this value with the percentiles of χ²(8):
p = c(5, 10, 25, 50, 75, 90, 95) / 100
r = 8
print(qchisq(p, r), 3)
#  2.73  3.49  5.07  7.34  10.22  13.36  15.51
# 27.07 is far beyond the 95th percentile (15.51): clearly we must discard the
# hypothesis that the distribution is Gaussian!
#
# A further example:
# I toss a coin 100 times and get 65 heads. Can the coin be considered fair?
# There is 1 degree of freedom.
p = c(5, 10, 25, 50, 75, 90, 95) / 100
r = 1
print(qchisq(p, r), 3)
# 0.00393 0.01579 0.10153 0.45494 1.32330 2.70554 3.84146
# chi-square of the observed heads/tails counts against the fair-coin 50/50:
ObsFr = c(65, 35)
ExpFr = c(50, 50)
sum((ObsFr - ExpFr)^2 / ExpFr)
# 9
# 9 is much larger than the 95th percentile (3.84146); I can not consider the
# coin balanced. If I want the percentile:
pchisq(9, 1)
# 0.9973002
#
# One last example:
# to study the distribution of the error of the measurements detected by means of a
# radio altimeter, 400 measurements are performed, obtaining the following results:
# interval   [20,30) [30,40) [40,50) [50,60) [60,70) [70,80) [80,90) [90,100)
# frequency     21      72      66      38      51      56      64      32
# Class boundaries and observed frequencies of the 400 altimeter measurements:
interv = c(20,30,40,50,60,70,80,90,100)
freq =     c(21,72,66,38,51,56,64,32);  n = 400
BF=3.5; HF=2.5                         # chart size (macosa-library globals)
histoclas(interv,freq)                 # See here for the histograms of classified data
# The mean (brown dot) is about  60.24998.
# We verify (by the test χ²) the conformity of the uniform distribution with the
# experimental distribution. We do not know the interval of uniform distribution. Let
# it be [a,b). We can calculate the mean and variance of the data, and express them as
# a function of a and b (and then derive the values of a and b).
# (Var and dataclas come from the macosa library; dataclas is presumably the
#  data reconstructed from the classes by histoclas — TODO confirm.)
M = mean(dataclas); V = Var(dataclas); M; V
#     60.24998            457.2783
# In the case of uniform distribution, M = (a+b)/2, V = (b-a)^2/12, or:
# a + b = 2*M,  b - a = sqrt(V*12).
b = (sqrt(V*12)+2*M)/2; a = (2*M-sqrt(V*12))/2; a; b
#   23.21169                97.28827
# Mark the fitted interval [a,b] on the chart:
line(a,1, b,1, "blue")
# Probabilities of the 8 classes under the fitted uniform distribution on
# [a,b): the first and last classes are truncated by a and b respectively,
# the six middle classes each have width 10.
# NOTE(review): the original assigned the scalar pr twice (pr = (30-a)/L was
# immediately overwritten by pr = (b-90)/L) and never set pr[8], so pr had
# length 7 and the first class probability was wrong. Assign pr[1] and pr[8]
# explicitly; the values below (0.0916..., 0.0984...) confirm the intent.
L = b-a; pr = numeric(8)
pr[1] = (30-a)/L; pr[8] = (b-90)/L
for(i in 2:7) pr[i] = 10/L
pr
#  0.09163906 0.13499544 0.13499544 0.13499544 0.13499544 0.13499544 0.13499544 0.09838829
ExpFr = n*pr; chi2 = sum((freq-ExpFr)^2/(ExpFr)); chi2
# 23.56327
# The degrees of freedom are 8-3 = 5 (I take 3 instead of 1 because I have also imposed
# mean and variance, or a and b).
# Percentiles of χ²(5) for comparison with the value 23.6 computed above:
p = c( 5, 10, 25, 50, 75, 90, 95 )/100;  r = 5; print(qchisq(p,r), 3)
#  1.15  1.61  2.67  4.35  6.63  9.24 11.07
# 23.6 is well beyond the 95th percentile. I therefore reject the hypothesis of
# uniformity.

# Now, we consider problems in which each member of a population can be classified
# according to two distinct characteristics. An example.
# Consider the following data (processed by an article of Spencer Russell, 1926);
# frequencies of sunset colors and whether each was followed by rain are presented:
#  Sky color  Number of observations  Number followed by rain
#  Red                     61               26
#  Mainly red             198               53
#  Yellow                 159               81
#  Mainly yellow          188               86
#  Red and yellow         190               51
#  Gray                   302              167
# I want to value the hypothesis that whether it rains tomorrow is independent of the
# color of today's sunset. There are various tests to do this, but they are not easy to
# interpret. A graphical representation is enough:
# Observations per sky color and how many of them were followed by rain:
days = c(61,198,159,188,190,302)
rain = c(26, 53, 81, 86, 51,167)
BF=4.5; HF=3.5; Plane(0,320, 0, 180); POINT(days,rain,"brown")
abovex("observations"); abovey("days followed by rain")
# LR0 (macosa library) fits the regression line through the origin; it
# presumably stores the coefficients in the globals LR01 and LR02 — TODO
# confirm against r.R.
LR0(days,rain,0,0)
# 0   0.4385932  <- LR01  LR02   y=LR01+LR02*x
f = function(x) LR01+LR02*x
graph1(f,0,330, "grey")
# An examination of the graph shows that there is no dependence between the two phenomena.
# All data pairs, in a way that is practically independent of the sky conditions the night
# before, are arranged more or less along a straight line passing through the origin. The
# proverb "red sky at night, good weather in sight" has no foundation.

# There are many other tests. You can explore them with this command (then you must
# click "Search Engine & Keywords" and type "test" in the text field)
help.start()   # opens the local HTML help index in the browser

# Here you can see some examples of using Student's t-test and other tests.