---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
 
# Let's see how to do some other statistical analysis of data.
 
# Here is an alternative way to those seen in g_B.htm and g_C.htm to build pies.
 
# The extensions of the hill, mountain and plain areas of "Piemonte" expressed in tens
# of square kilometers.  First way:
pie(c(770,1099,671))
# I add the names:
piemonte = c(770,1099,671)
names(piemonte) = c("hil","mou","pla")
pie(piemonte)
# I  choose the colors:
pie(piemonte, col=c("yellow","brown","green"))
 
      
 
# If I want to keep the previous diagram I can do the next in a new window, which I can
# open with the command dev.new()
 
# An alternative way to build barplot.  First way:
barplot(piemonte)
# I reduce the space between the columns and place a grid:
barplot(piemonte, space=0)
abline(h=c(200,400,600,800,1000),lty=3)
# I add the colors:
barplot(piemonte, space=0, col=c("yellow","brown","green"))
abline(h=c(200,400,600,800,1000),lty=3)
 
  
 
# Here is the diagram with the percentages:
 
colors = c("yellow","brown","green")
barplot(piemonte/sum(piemonte)*100,space=0,col=colors)
abline(h=c(10,20,30,40),lty=3)
                  
                      ---------- ---------- ---------- ----------
 
# The heights of some 13-year-old pupils  (analyzed with commands already seen):
pup = c(146,158,152,140,157,147,161,147,149,154,155,153,155,156,150,153,152,145)
sort(pup)
#[1] 140 145 146 147 147 149 150 152 152 153 153 154 155 155 156 157 158 161
histogram(pup)
#[the distance of the sides of the grid is   2.5 %] 
#  Frequencies and percentage freq.: 
#1, 5, 6, 5, 1
#5.56,27.78,33.33,27.78,5.56
#  For other statistics use the command   morestat() 
morestat()
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  140.0   147.5   152.5   151.7   155.0   161.0 
#    The brown dots are 5^ and 95^ percentiles 
#           The red dot is the mean 
Histogram(pup, 140,161, 3)
#[the distance of the sides of the grid is   1.5 %] 
#  Frequencies and percentage freq.: 
#1, 1, 3, 2, 5, 4, 2
#5.56,5.56,16.67,11.11,27.78,22.22,11.11
           
 
# What we obtain with the command hist.
hist(pup, right=FALSE)
# "right=FALSE" tells the program that the intervals are  […,…).
 

 
# If I add probability=TRUE I obtain the figure on the right, in which the vertical
# scale has changed: the probability densities are plotted (the relative frequencies
# divided by the amplitude of the intervals) so that the histogram has a total area
# equal to 1  [ we added also the option  col="yellow"  and the command BOX(); for the
# third graph I type again  hist(…, add=TRUE) with "add=TRUE" ].  Or:
 
hist(pup, right=FALSE, seq(140,161, 3), cex.axis=0.8)
 
              
# With summary I obtain similar results to those obtained with morestat()
summary(pup)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  140.0   147.5   152.5   151.7   155.0   161.0
# With statistic I obtain the boxplot, with statistics both
statistic(pup)
             
statistics(pup)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  140.0   147.5   152.5   151.7   155.0   161.0
             
 
# If I want to compare this data with:
 
pup2 = c(146,158,152,140,157,147,161,147,149,154,155,153,
155,156,150,153,152,145,158,140,147,149,155,156,153,145,
141,156,158,151,161,148,157,159,160,162,149,163,152,149,
157,155)
# I can use:
 
BF=4;HF=1.2
boxAB=c(140,163); statistics(pup)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#140.0   147.5   152.5   151.7   155.0   161.0
boxAB=c(140,163); statistics(pup2)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#140.0   149.0   153.0   152.6   157.0   163.0
 
        
 
# To compare the histograms i can use:
 
h1 = max(hist(pup,seq(140,164,3),plot=FALSE)$density); h1   # [1] 0.09259259
h2 = max(hist(pup2,seq(140,164,3),plot=FALSE)$density); h2  # [1] 0.07142857
h = max( c(h1,h2) ); h                                      # [1] 0.09259259
# the maximum of those that would be the heights of the two histograms
BF=4; HF=3
noBox=1; boX(140,165,0,h)
S = seq(140,164,3)
hist(pup, S,angle=45,density=15,probability=TRUE,add=TRUE,border=1,col="blue")
hist(pup2,S,angle=135,density=15,probability=TRUE,add=TRUE,border=1,col="red")
 
        
                      ---------- ---------- ---------- ----------
 
# I can introduce data values from the keyboard using the command scan(file="",n=...)
# where in place of ... you have to put the data to read. Two examples:
 
numbers = scan(file="", n=3); sum(numbers)
#1: 3
#2: 7
#3: 5
#Read 3 items
#[1] 15
 
# In the second example I introduce N and then N data (of which I calculate the mean):
 
npup = scan(file="",n=1); altpup = scan(file="",n=npup); mean(altpup)
#1: 7
#Read 1 item
#1: 154
#2: 156
#3: 163
#4: 170
#5: 161
#6: 156
#7: 159
#Read 7 items
#[1] 159.8571
                      ---------- ---------- ---------- ----------
 
# The data can be copied from the network and pasted into "R" or can be downloaded from
# the network. An example ("t_sec.txt" file in the R folder). I read a few lines (eg 4)
# of a file (I read a few because it can be very long, eg contain a few thousand data):
 
readLines("http://macosa.dima.unige.it/R/t_sec.txt",n=4)
#[1] " measures (truncated to centisecond) of a period of 1 s timed by hand"
#[2] "111"
#[3] "103"
#[4] "109"
 
# I see that the data is integer and that there is 1 line to jump. I do it with the
# skip command. The data is separated by an "end-of-line". I read them with the command
# "scan" (if they were separated by, for example, ";" I would add sep=";")
 
data = scan("http://macosa.dima.unige.it/R/t_sec.txt", skip=1)
# Read 47 items
 
# Alternatively, I can examine the file from a browser, copy it and then use
# scan("clipboard", skip=1)
 
# I can also use (from File menu) OpenScript/ApriScript (and NewScript/NuovoScript):
# see here.
 
      
 
# The str command displays the structure of the objects. I use it to examine "data"
 
str(data)
#  num [1:47] 111 103 109 97 99 110 99 103 109 106 ...
# They are 47. They are truncated to the integers. To make correct evaluations I need
# to have rounded values: I add 1/2. 
 
DATA = data+1/2
summary(DATA)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  68.50   96.50   98.50   99.86  108.00  129.50
# Esaminandoli con histogram o con hist:
 
     
                      ---------- ---------- ---------- ----------
 
# If I have data classified in different amplitude ranges I can use the histoclas
# command: see.
                      ---------- ---------- ---------- ----------
 
# We can generate random numbers. For example
runif(4, 1,10)
# [I can obtain  9.139260 3.747637 3.847650 2.825768]
# generates 4 random numbers "uniformly" distributed on the interval (1,10).
# Whereas:
RUNIF(20, 1,6)
# [I can obtain  2 5 4 2 3 5 6 5 6 2 5 1 5 1 6 2 1 3 1 6 ]
# generates 20 random integers "uniformly" distributed on {1,2,3,4,5,6}.
# and
RUNIF2(20, 1,6, c(2,1,1,1,1,0.5))
# generates 20 random integers distributed on {1,2,3,4,5,6} where 1 [2] has the chance
# to get out double [half] compared to the chances of 2,3,4,5 (a die with face "1"
# heavier than the face "6")
# [I can obtain  5 3 1 4 2 5 5 2 4 6 1 1 2 1 2 2 1 2 2 1 ]
 
# If I want generate random elements from a generic set I can use RunIf: see here
 
# Let's see the throwing, ten thousand times, of two balanced dice:
 
n = 10000; dice = RUNIF(n, 1,6) + RUNIF(n, 1,6)
 
# The values have been put into "dice". Here are some:
 
dice[1]; dice[1:10]
# [1]  7
# [1]  7  9  9  7  8  6  4  10  7  7
 
hist(dice, seq(1.5,12.5,1),probability=TRUE,col="yellow")
abline(h=c(0.05,0.1,0.15), lty=3)
 
        
 
# The possible outcomes are 6*6 = 36; we assumed they are equally probable: everyone
# has probability 1/36. For example, 3 can exit in 2 cases: 1 and 2, 2 and 1, so with
# the probability: 2*1/36 = 0.0555…; and indeed with 10 000 launches I got a percentage
# close to 5.55…% The histogram on the right was obtained with the Histogram command:
 
noClass=1; Histogram(dice, 1.5,12.5, 1)
#         Frequencies and percentage freq.:
#277, 554, 857, 1084, 1442, 1663, 1337, 1126, 862, 532, 266
#2.77,5.54,8.57,10.84,14.42,16.63,13.37,11.26,8.62,5.32,2.66
# The percentage freq. of 3 is 5.54% (ie 0.0554).
# But if I roll the die (considered above) with the
# face "1" heavier than the face "6":
 
D = c(2,1,1,1,1,0.5)
n = 10000; dice = RUNIF2(n, 1,6,D) + RUNIF2(n, 1,6,D)
hist(dice, seq(1.5,12.5,1),probability=TRUE,col="yellow")
abline(h=c(0.05,0.1,0.15), lty=3)
#
# It is easy simulate a process and produce an animated
# histogram: see here
   
# I can also find probabilities about phenomena that are not easy to study in # theoretical terms. Let's see how to study the outputs of the launch of a pair of # balanced dice, if we take as output the difference of the values that come out, which # can go from 0 to 5. We expect 5 to be the least probable output (It only comes out if # 6 and 1 are drawn) only if a 6 and 1), but it is not easy to evaluate other cases. # So let's take the absolute value of the difference between the drawn numbers. n = 1e7; dice = abs( RUNIF(n,1,6)-RUNIF(n,1,6) ) Histogram(dice, -0.5,5.5, 1) # Frequencies and percentage freq.: #1666379, 2778367, 2221573, 1665591, 1111816, 556274 #16.66,27.78,22.22,16.66,11.12,5.56 # With n = 10^7 (and with n = 10^6 too) I get the most frequent output is 3 (27.78% of # probability). At this point I can try to study the problem even theoretically
# A use of RUNIF to simulate the
# Brownian motion of a particle
# (the positions that the particle
# occupies tend to be equidistributed).
# See here.
   
---------- ---------- ---------- ---------- # This was an example of simulation. To estimate the accuracy of the estimate, you # can use the following command: Pr = function(n) {f = 0; for (i in 1:n) f = f + ifelse(Event(),1,0); fr=f/n; S=sqrt(fr*(1-fr)/(n-1)); cat(fr, "+/-", 3*S,'\n'); fr0 <<-fr; n0 <<-n } # If, after Pr, I use Pr2 I can increase the number of experiments: Pr2 = function(n) {f=0; for (i in 1:n) f=f+ifelse(Event(),1,0); n=n+n0; f=f+fr0*n0 fr=f/n;S=sqrt(fr*(1-fr)/(n-1));cat(fr,"+/-",3*S,' n =',n,'\n'); fr0<<-fr; n0<<-n} # or, if I want the result in percentage form: PR = function(n) {f = 0; for (i in 1:n) f = f + ifelse(Event(),1,0) fr=f/n; S=sqrt(fr*(1-fr)/(n-1)); cat(fr*100, "+/-", 3*S*100,'\n'); fr0<<- fr; n0<<- n} PR2 = function(n) {f=0; for (i in 1:n) f=f+ifelse(Event(),1,0); n=n+n0; f=f+fr0*n0 fr=f/n;S=sqrt(fr*(1-fr)/(n-1));cat(fr*100,"+/-",3*S*100,' n =',n,'\n');fr0<<-fr;n0<<-n} # I put a description in Event that is 1 if the phenomenon happens # 0 if it does not happen. # Let us consider, referring to the example above, the probability that throwing a pair # of balanced dice their difference is 1. Event = function() abs( RUNIF(1, 1,6)-RUNIF(1, 1,6) ) == 1 Pr(1e4) #0.2825 +/- 0.01350713 Pr(1e5) #0.27771 +/- 0.004248885 # If I use PR PR(1e4) #28.25 +/- 1.350713 [%] PR(1e6) #27.7448 +/- 0.1343219 PR2(4e6) # you must wait a few seconds #27.7586 +/- 0.06007975 n = 5e+06 PR2(5e6) #27.7754 +/- 0.04249071 n = 1e+07 # We can deduce that the % probability is 28.25±1.35, indeed … 27.7754±0.000425 or # 27.775±0.00043. Going forward (if I have time …) I can find more accurate evaluations # (these values are not certain, but have a probability of 99.7% - see here). # I can suppose that the probability is 27.777…/100 = (2+0.777…)/10 = (2+7/9)/10 = 25/90 = 5/18 # In this case it is easy to find the exact probability: the right output are 10 (12,23, # 34,45,56,65,54,43,32,21) and 10/36 = 5/18. # See here how to modify Pr to work on its outcomes (Buffon's needle problem) # Another example. A well-mixed pack of 40 playing cards is split (entirely randomly) # in two equal parts. With what probability the same number of black cards and red ones # is on both sides? # It's enough that I pull out twenty cards and check if there are ten black cards # between them. # I put in random order the first 40 positive integers in y. I consider the numbers # from 1 to 20 as black cards and the numbers from 21 to 40 as red cards. # I check if among the first 20 numbers there are 10 less than 21. Event = function() { y = sample(1:40); k=0; for(i in 1:20) if(y[i] < 21) k=k+1 ifelse(k==10, 1, 0) } Pr(1e2) #0.29 +/- 0.1368144 Pr(1e3) #0.248 +/- 0.04098956 Pr(1e4) #0.2527 +/- 0.01303746 Pr(1e5) #0.2452 +/- 0.00408131 Pr(1e6) # This calculation takes about 1 minute #0.247499 +/- 0.001294678 Pr(1e7) # This calculation takes a few minutes #0.247592 +/- 0.0004094645 # I conclude that (99.7 per cent) the probability is 24.76 ± 0.04 %. # If I reflect, I can understand that the probability is: choose(20,10)*choose(20,10)/choose(40,20) #0.2476289 # Another one. In a version of a dice game ("zara") players throw three dice; who first # gets at least 2 equal numbers wins. What is the probability of getting this in a # throw? Event=function() {a=RUNIF(1,1,6);b=RUNIF(1,1,6);c=RUNIF(1,1,6); ifelse(a==b|a==c|b==c,1,0)} Pr(1e4) #0.4462 +/- 0.01491366 Pr2(9e4) #0.44245 +/- 0.004711915 n = 1e+05 Pr2(1e5) #0.44411 +/- 0.00333309 n = 2e+05 Pr2(2e5) #0.4433775 +/- 0.002356454 n = 4e+05 Pr2(4e5) #0.4435712 +/- 0.001666338 n = 8e+05 Pr2(8e5) #0.4439169 +/- 0.001178371 n = 1600000 # I suppose it is: fraction(0.44444444444444444444) # 4/9 # I reflect. The probability that the 2nd output is different from the first one is 5/6. # The probablity that the third output is different from the first two is 4/6. The # probablity that the 3 outputs are different is 5/6*4/6. So the probability that at # least 2 are equal is: fraction(1-5/6*4/6) #4/9 # A more complex evaluation: # what is the probability (by drawing 10 cards from a 40-pack deck) to get at least a # tris (ie 3 or 4 cards of the same value)? Event = function() {V=0; N=NULL; for(i in 1:10) N[i]=0; car=sample(1:40); # X%%10 is 3 if X is 3 or 13 or 23 or 33 i=1; while(i<11) {x=1+car[i]%%10; N[x]=N[x]+1; if(N[x]==3) {i=11; V=1} else i=i+1}; V} # After describing the event, let's value the probability: PR(1e3) #40.9 +/- 4.666528 PR(1e4) #37.65 +/- 1.453596 PR(1e5) # For this output, wait a few seconds! #38.396 +/- 0.4613929 PR(1e6) # For this output, wait a few minutes! #38.4978 +/- 0.1459771 PR(1e7) # For this output, wait more time #38.43589 +/- 0.04614807 # The probability is (38.436±0.047)% or (38.44±0.05)%. # If I want generate a random permutation of the elements M,M+1,...,N, I can use # sample(M:N). Examples: sample(14:20) #14 16 20 15 17 19 18 # This is useful in many situations. Eg: a generic order in which I can extract the # thirteen hearts cards: sample(1:13) #4 9 12 3 13 7 10 11 5 6 2 1 8 # I can extract (randomly) 4 numbers between 1 and 13: sample(1:13, 4) #6 10 11 12 # The numbers generated with runif and sample appear distributed randomly. Actually, I # can get the same sequence of numbers if I write set.seed(n) with the same integer n. # Example sample(5:10); set.seed(273); sample(5:10); set.seed(273); sample(5:10) # 6 10 7 5 8 9 # 6 7 5 9 10 8 # 6 7 5 9 10 8 set.seed(273); sample(5:10,3) # 6 7 5 # This is useful, for example, to control a procedure in which you want to use the # random number generator. # If you want to know the structure of a random number generator see here. # With these commands, Code(seed,sentence) and Decode(…,…) were also built (to make # "secret" encodings); a whole number must be put into "seed". Here are some examples # (see here if you want to see how they were made): sentence = "Ci vediamo alle 17:30 davanti a casa tua. Ciao!" Code(314, sentence) # "\"m%rfgmcew%c~~f%5(*-?%gcrcdqm%c%vcac%q|c6%\"mcw/" Decode(314, "\"m%rfgmcew%c~~f%5(*-?%gcrcdqm%c%vcac%q|c6%\"mcw/") # "Ci vediamo alle 17:30 davanti a casa tua. Ciao!" sentence2 = "195/3 = 65" Code(-10, sentence2) # "F!I74TXTSI" Decode(-10, "F!I74TXTSI") # "195/3 = 65" With the commands sample(N) and SAMPLE(N) you can reorder the first N positive natural numbers or the first N letters (if N < 27) set.seed(173); sample(9); set.seed(173); SAMPLE(9) # 3 2 7 4 9 1 5 8 6 # "CBGDIAEHF" ---------- ---------- ---------- ---------- # Technical NOTE on how to "save" the graphics and commands used. # To store an image I can click on it and (from the menu that appears) save it as # BitMap and then # -- paste it directly into a document of the program you are writing with (OpenOffice, # LibreOffice, Wordpad, …, Word, …) or # -- paste it into Paint (or another graphical program), add what you want, move it and # lower the margins, and then either - copy it into any document you are writing or - form Paint, save it in PNG or GIF format, which occupy little space. # To save the used commands I can (from the File menu) run Save History that saves all # executed commands in a text-file (I must give a name; I can leave the "Rhistory" # extension or put the "txt" extension). Then, afterwards, I can # [ or automatically load commands from the file using the Load History command, # which can also be used to automatically recall commands uploaded to other # files, or ] # open the file with NotePad or another editor and copy and paste in R the sequence of # commands I want. # For a complete formulary of all the commands, see here. ---------- ---------- ---------- ---------- # For more advanced uses (correlation, asimmetry, QQ plot, linear regression, variance, # Gaussian, other distributions, Contingency Tables, ...) go HERE. # We just remember that beside the commands (present in the software) var(data) and # sd(data), which calculate "experimental" variance and standard deviation (ie variance # multiply by n/(n-1), if n is the number of data) and its square root, you can use: # Var(data), Sd(data) and SdM(data) that calculate the "theoretic" variance # and standard deviation (or root mean square) and the standard deviation of the mean. # Instead of Sd, sqm (in Italian: "scarto quadratico medio") also is used. # If Xi are the data and M is the mean of them: # Sd is √( ( (X1-M)^2 + … + (Xn-M)^2 ) / n ) = Var # Var is ( (X1-M)^2 + … + (Xn-M)^2 ) / n # sd is √( ( (X1-M)^2 + … + (Xn-M)^2 ) / (n-1) ) = var # var is ( (X1-M)^2 + … + (Xn-M)^2 ) / (n-1) # SdM is √( ( (X1-M)^2 + … + (Xn-M)^2 ) / (n-1) / n ) = sd / n # [or sdM] # If I have the outcomes with their frequency I can compute the weighted mean and # analogous value with: Wmean(data,frequencies) and Wmedian, WSd (or Wsqm), WVar: d1=c(4,5,5,5,6,6); mean(d1); median(d1); Sd(d1); Var(d1) # 5.166667 5 0.6871843 0.4722222 d = c(4,5,6); f = c(1,3,2); Wmean(d,f); Wmedian(d,f); WSd(d,f); WVar(d,f) # 5.166667 5 0.6871843 0.4722222 # I can compute the number n of data: n = sum(f) # If I want compute "Median" (see here) I can use WMedian: d1=c(1,1,1,5,6,6,6,6); d=c(1,2,3,5,6); f=c(1,1,1,1,4) Median(d1); WMedian(d,f); median(d1); Wmedian(d,f) # 5 5 5.5 5.5 Other examples of use