R2s

---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
 
# Let's see how to do some other statistical analysis of data.
 
# Here is an alternative way to those seen in g_B.htm and g_C.htm to build pies.
 
# The extensions of the hill, mountain and plain areas of "Piemonte" expressed in tens
# of square kilometers.  First way:
pie(c(770,1099,671))
# I add the names:
piemonte = c(770,1099,671)
names(piemonte) = c("hil","mou","pla")
pie(piemonte)
# I  choose the colors:
pie(piemonte, col=c("yellow","brown","green"))
 
      
 
# If I want to keep the previous diagram I can do the next in a new window, which I can
# open with the command dev.new()
 
# An alternative way to build barplot.  First way:
barplot(piemonte)
# I reduce the space between the columns and place a grid:
barplot(piemonte, space=0)
abline(h=c(200,400,600,800,1000),lty=3)
# I add the colors:
barplot(piemonte, space=0, col=c("yellow","brown","green"))
abline(h=c(200,400,600,800,1000),lty=3)
 
  
 
# Here is the diagram with the percentages:
 
colors = c("yellow","brown","green")
barplot(piemonte/sum(piemonte)*100,space=0,col=colors)
abline(h=c(10,20,30,40),lty=3)
                  
                      ---------- ---------- ---------- ----------
 
# The heights of some 13-year-old pupils  (analyzed with commands already seen):
pup = c(146,158,152,140,157,147,161,147,149,154,155,153,155,156,150,153,152,145)
sort(pup)
#[1] 140 145 146 147 147 149 150 152 152 153 153 154 155 155 156 157 158 161
histogram(pup)
#[the distance of the sides of the grid is   2.5 %] 
#  Frequencies and percentage freq.: 
#1, 5, 6, 5, 1
#5.56,27.78,33.33,27.78,5.56
#  For other statistics use the command   morestat() 
morestat()
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  140.0   147.5   152.5   151.7   155.0   161.0 
#    The brown dots are 5^ and 95^ percentiles 
#           The red dot is the mean 
Histogram(pup, 140,161, 3)
#[the distance of the sides of the grid is   1.5 %] 
#  Frequencies and percentage freq.: 
#1, 1, 3, 2, 5, 4, 2
#5.56,5.56,16.67,11.11,27.78,22.22,11.11
           
 
# What we obtain with the command hist.
hist(pup, right=FALSE)
# "right=FALSE" tells the program that the intervals are  […,…).
 

 
# If I add probability=TRUE I obtain the figure on the right, in which the vertical
# scale has changed: the probability densities are plotted (the relative frequencies
# divided by the amplitude of the intervals) so that the histogram has a total area
# equal to 1  [ we added also the option  col="yellow"  and the command BOX(); for the
# third graph I type again  hist(…, add=TRUE) with "add=TRUE" ].  Or:
 
hist(pup, right=FALSE, seq(140,161, 3), cex.axis=0.8)
 
              
# With summary I obtain similar results to those obtained with morestat()
summary(pup)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  140.0   147.5   152.5   151.7   155.0   161.0
# With statistic I obtain the boxplot, with statistics both
statistic(pup)
             
statistics(pup)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#  140.0   147.5   152.5   151.7   155.0   161.0
             
 
# If I want to compare this data with:
 
pup2 = c(146,158,152,140,157,147,161,147,149,154,155,153,
155,156,150,153,152,145,158,140,147,149,155,156,153,145,
141,156,158,151,161,148,157,159,160,162,149,163,152,149,
157,155)
# I can use:
 
BF=4;HF=1.2
boxAB=c(140,163); statistics(pup)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#140.0   147.5   152.5   151.7   155.0   161.0
boxAB=c(140,163); statistics(pup2)
# Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#140.0   149.0   153.0   152.6   157.0   163.0
 
        
 
# To compare the histograms i can use:
 
h1 = max(hist(pup,seq(140,164,3),plot=FALSE)$density); h1   # [1] 0.09259259
h2 = max(hist(pup2,seq(140,164,3),plot=FALSE)$density); h2  # [1] 0.07142857
h = max( c(h1,h2) ); h                                      # [1] 0.09259259
# the maximum of those that would be the heights of the two histograms
BF=4; HF=3
noBox=1; boX(140,165,0,h)
S = seq(140,164,3)
hist(pup, S,angle=45,density=15,probability=TRUE,add=TRUE,border=1,col="blue")
hist(pup2,S,angle=135,density=15,probability=TRUE,add=TRUE,border=1,col="red")
 
        
                      ---------- ---------- ---------- ----------
 
# I can introduce data values from the keyboard using the command scan(file="",n=...)
# where in place of ... you have to put the data to read. Two examples:
 
numbers = scan(file="", n=3); sum(numbers)
#1: 3
#2: 7
#3: 5
#Read 3 items
#[1] 15
 
# In the second example I introduce N and then N data (of which I calculate the mean):
 
npup = scan(file="",n=1); altpup = scan(file="",n=npup); mean(altpup)
#1: 7
#Read 1 item
#1: 154
#2: 156
#3: 163
#4: 170
#5: 161
#6: 156
#7: 159
#Read 7 items
#[1] 159.8571
                      ---------- ---------- ---------- ----------
 
# The data can be copied from the network and pasted into "R" or can be downloaded from
# the network. An example ("t_sec.txt" file in the R folder). I read a few lines (eg 4)
# of a file (I read a few because it can be very long, eg contain a few thousand data):
 
readLines("http://macosa.dima.unige.it/R/t_sec.txt",n=4)
#[1] " measures (truncated to centisecond) of a period of 1 s timed by hand"
#[2] "111"
#[3] "103"
#[4] "109"
 
# I see that the data is integer and that there is 1 line to jump. I do it with the
# skip command. The data is separated by an "end-of-line". I read them with the command
# "scan" (if they were separated by, for example, ";" I would add sep=";")
 
data = scan("http://macosa.dima.unige.it/R/t_sec.txt", skip=1)
# Read 47 items
 
# Alternatively, I can examine the file from a browser, copy it and then use
# scan("clipboard", skip=1)
 
# I can also use (from File menu) OpenScript/ApriScript (and NewScript/NuovoScript):
# see here.
 
      
 
# The str command displays the structure of the objects. I use it to examine "data"
 
str(data)
#  num [1:47] 111 103 109 97 99 110 99 103 109 106 ...
# They are 47. They are truncated to the integers. To make correct evaluations I need
# to have rounded values: I add 1/2. 
 
DATA = data+1/2
summary(DATA)
#   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#  68.50   96.50   98.50   99.86  108.00  129.50
# Esaminandoli con histogram o con hist:
 
     
                      ---------- ---------- ---------- ----------
 
# If I have data classified in different amplitude ranges I can use the histoclas
# command: see.
                      ---------- ---------- ---------- ----------
 
# We can generate random numbers. For example
runif(4, 1,10)
# [I can obtain  9.139260 3.747637 3.847650 2.825768]
# generates 4 random numbers "uniformly" distributed on the interval (1,10).
# Whereas:
RUNIF(20, 1,6)
# [I can obtain  2 5 4 2 3 5 6 5 6 2 5 1 5 1 6 2 1 3 1 6 ]
# generates 20 random integers "uniformly" distributed on {1,2,3,4,5,6}.
# and
RUNIF2(20, 1,6, c(2,1,1,1,1,0.5))
# generates 20 random integers distributed on {1,2,3,4,5,6} where 1 [2] has the chance
# to get out double [half] compared to the chances of 2,3,4,5 (a die with face "1"
# heavier than the face "6")
# [I can obtain  5 3 1 4 2 5 5 2 4 6 1 1 2 1 2 2 1 2 2 1 ]
 
# If I want generate random elements from a generic set I can use RunIf: see here
 
# Let's see the throwing, ten thousand times, of two balanced dice:
 
n = 10000; dice = RUNIF(n, 1,6) + RUNIF(n, 1,6)
 
# The values have been put into "dice". Here are some:
 
dice[1]; dice[1:10]
# [1]  7
# [1]  7  9  9  7  8  6  4  10  7  7
 
hist(dice, seq(1.5,12.5,1),probability=TRUE,col="yellow")
abline(h=c(0.05,0.1,0.15), lty=3)
 
        
 
# The possible outcomes are 6*6 = 36; we assumed they are equally probable: everyone
# has probability 1/36. For example, 3 can exit in 2 cases: 1 and 2, 2 and 1, so with
# the probability: 2*1/36 = 0.0555…; and indeed with 10 000 launches I got a percentage
# close to 5.55…% The histogram on the right was obtained with the Histogram command:
 
noClass=1; Histogram(dice, 1.5,12.5, 1)
#         Frequencies and percentage freq.:
#277, 554, 857, 1084, 1442, 1663, 1337, 1126, 862, 532, 266
#2.77,5.54,8.57,10.84,14.42,16.63,13.37,11.26,8.62,5.32,2.66
# The percentage freq. of 3 is 5.54% (ie 0.0554).
# But if I roll the die (considered above) with the
# face "1" heavier than the face "6":
 
D = c(2,1,1,1,1,0.5)
n = 10000; dice = RUNIF2(n, 1,6,D) + RUNIF2(n, 1,6,D)
hist(dice, seq(1.5,12.5,1),probability=TRUE,col="yellow")
abline(h=c(0.05,0.1,0.15), lty=3)
#
# It is easy simulate a process and produce an animated
# histogram: see here
   
 
# I can also find probabilities about phenomena that are not easy to study in
# theoretical terms. Let's see how to study the outputs of the launch of a pair of
# balanced dice, if we take as output the difference of the values that come out, which
# can go from 0 to 5. We expect 5 to be the least probable output (It only comes out if
# 6 and 1 are drawn) only if a 6 and 1), but it is not easy to evaluate other cases.
# So let's take the absolute value of the difference between the drawn numbers.
 
n = 1e7; dice = abs( RUNIF(n,1,6)-RUNIF(n,1,6) )
Histogram(dice, -0.5,5.5, 1)
 
#         Frequencies and percentage freq.:
#1666379, 2778367, 2221573, 1665591, 1111816, 556274
#16.66,27.78,22.22,16.66,11.12,5.56
   
 
# With n = 10^7 (and with n = 10^6 too) I get the most frequent output is 3 (27.78% of
# probability).  At this point I can try to study the problem even theoretically …
 
# A use of RUNIF to simulate the
# Brownian motion of a particle
# (the positions that the particle
# occupies tend to be equidistributed).
# See here.
    
                      ---------- ---------- ---------- ----------
 
# This was an example of simulation. To estimate the accuracy of the estimate, you
# can use the following command:
 
Pr = function(n) {f = 0; for (i in 1:n) f = f + ifelse(Event(),1,0);
       fr=f/n; S=sqrt(fr*(1-fr)/(n-1)); cat(fr, "+/-", 3*S,'\n'); fr0 <<-fr; n0 <<-n }
 
# If, after Pr, I use Pr2 I can increase the number of experiments:
 
Pr2 = function(n) {f=0; for (i in 1:n) f=f+ifelse(Event(),1,0); n=n+n0; f=f+fr0*n0
 fr=f/n;S=sqrt(fr*(1-fr)/(n-1));cat(fr,"+/-",3*S,'  n =',n,'\n'); fr0<<-fr; n0<<-n}
 
# or, if I want the result in percentage form:
 
PR = function(n) {f = 0; for (i in 1:n) f = f + ifelse(Event(),1,0)
 fr=f/n; S=sqrt(fr*(1-fr)/(n-1)); cat(fr*100, "+/-", 3*S*100,'\n'); fr0<<- fr; n0<<- n}
 
PR2 = function(n) {f=0; for (i in 1:n) f=f+ifelse(Event(),1,0); n=n+n0; f=f+fr0*n0
 fr=f/n;S=sqrt(fr*(1-fr)/(n-1));cat(fr*100,"+/-",3*S*100,'  n =',n,'\n');fr0<<-fr;n0<<-n}
 
# I put a description in  Event  that is 1 if the phenomenon happens
#                                        0 if it does not happen.
 
# Let us consider, referring to the example above, the probability that throwing a pair
# of balanced dice their difference is 1.
 
Event = function() abs( RUNIF(1, 1,6)-RUNIF(1, 1,6) ) == 1
Pr(1e4)
#0.2825 +/- 0.01350713 
Pr(1e5)
#0.27771 +/- 0.004248885 
# If I use PR
PR(1e4)
#28.25 +/- 1.350713  [%]
PR(1e6)
#27.7448 +/- 0.1343219 
PR2(4e6)                   # you must wait a few seconds
#27.7586 +/- 0.06007975   n = 5e+06
PR2(5e6)
#27.7754 +/- 0.04249071   n = 1e+07
 
# We can deduce that the % probability is 28.25±1.35, indeed … 27.7754±0.000425 or
# 27.775±0.00043. Going forward (if I have time …) I can find more accurate evaluations
# (these values are not certain, but have a probability of 99.7% - see here).
# I can suppose that the probability is 27.777…/100 = (2+0.777…)/10 = (2+7/9)/10 = 25/90 = 5/18
# In this case it is easy to find the exact probability: the right output are 10 (12,23,
# 34,45,56,65,54,43,32,21) and 10/36 = 5/18.
 
# See here how to modify Pr to work on its outcomes (Buffon's needle problem)
 
# Another example.  A well-mixed pack of 40 playing cards is split (entirely randomly)
# in two equal parts. With what probability the same number of black cards and red ones
# is on both sides?
# It's enough that I pull out twenty cards and check if there are ten black cards
# between them.
# I put in random order the first 40 positive integers in y. I consider the numbers
# from 1 to 20 as black cards and the numbers from 21 to 40 as red cards.
# I check if among the first 20 numbers there are 10 less than 21.
 
Event = function() { y = sample(1:40); k=0; for(i in 1:20) if(y[i] < 21) k=k+1
                     ifelse(k==10, 1, 0) }
Pr(1e2)
#0.29 +/- 0.1368144
Pr(1e3)
#0.248 +/- 0.04098956
Pr(1e4)
#0.2527 +/- 0.01303746
Pr(1e5)
#0.2452 +/- 0.00408131
Pr(1e6)                     # This calculation takes about 1 minute
#0.247499 +/- 0.001294678
Pr(1e7)                     # This calculation takes a few minutes
#0.247592 +/- 0.0004094645
 
# I conclude that (99.7 per cent) the probability is 24.76 ± 0.04 %.
# If I reflect, I can understand that the probability is:
choose(20,10)*choose(20,10)/choose(40,20)
#0.2476289
 
# Another one. In a version of a dice game ("zara") players throw three dice; who first
# gets at least 2 equal numbers wins. What is the probability of getting this in a
# throw?
Event=function() {a=RUNIF(1,1,6);b=RUNIF(1,1,6);c=RUNIF(1,1,6); ifelse(a==b|a==c|b==c,1,0)}
Pr(1e4)
#0.4462 +/- 0.01491366 
Pr2(9e4)
#0.44245 +/- 0.004711915   n = 1e+05 
Pr2(1e5)
#0.44411 +/- 0.00333309   n = 2e+05 
Pr2(2e5)
#0.4433775 +/- 0.002356454   n = 4e+05 
Pr2(4e5)
#0.4435712 +/- 0.001666338   n = 8e+05 
Pr2(8e5)
#0.4439169 +/- 0.001178371   n = 1600000 
# I suppose it is:
fraction(0.44444444444444444444)
# 4/9
# I reflect. The probability that the 2nd output is different from the first one is 5/6.
# The probablity that the third output is different from the first two is 4/6. The
# probablity that the 3 outputs are different is 5/6*4/6. So the probability that at
# least 2 are equal is:
fraction(1-5/6*4/6)
#4/9
 
# A more complex evaluation:
# what is the probability (by drawing 10 cards from a 40-pack deck) to get at least a
# tris (ie 3 or 4 cards of the same value)?
Event = function() {V=0; N=NULL; for(i in 1:10) N[i]=0; car=sample(1:40);
                   # X%%10 is 3 if X is 3 or 13 or 23 or 33 
                    i=1; while(i<11) {x=1+car[i]%%10; N[x]=N[x]+1;
                                      if(N[x]==3) {i=11; V=1} else i=i+1}; V}
 
# After describing the event, let's value the probability:
PR(1e3)
#40.9 +/- 4.666528 
PR(1e4)
#37.65 +/- 1.453596 
PR(1e5)                 # For this output, wait a few seconds!
#38.396 +/- 0.4613929
PR(1e6)                 # For this output, wait a few minutes!
#38.4978 +/- 0.1459771
PR(1e7)                 # For this output, wait more time
#38.43589 +/- 0.04614807
 
#  The probability is (38.436�0.047)% or (38.44�0.05)%.

# If I want generate a random permutation of the elements M,M+1,...,N, I can use
# sample(M:N). Examples:
 
sample(14:20)
#14 16 20 15 17 19 18
 
# This is useful in many situations. Eg: a generic order in which I can extract the
# thirteen hearts cards:
 
sample(1:13)
#4  9  12  3  13  7  10  11  5  6  2  1  8
 
# I can extract (randomly) 4 numbers between 1 and 13:
sample(1:13, 4)
#6 10 11 12

# The numbers generated with runif and sample appear distributed randomly. Actually, I
# can get the same sequence of numbers if I write set.seed(n) with the same integer n.
# Example
sample(5:10); set.seed(273); sample(5:10); set.seed(273); sample(5:10)
#  6 10  7  5  8  9
#  6  7  5  9 10  8
#  6  7  5  9 10  8
set.seed(273); sample(5:10,3)
#  6  7  5
 
# This is useful, for example, to control a procedure in which you want to use the
# random number generator.
# If you want to know the structure of a random number generator see here.

# With these commands, Code(seed,sentence) and Decode(…,…) were also built (to make
# "secret" encodings); a whole number must be put into "seed". Here are some examples
# (see here if you want to see how they were made):
 
sentence = "Ci vediamo alle 17:30 davanti a casa tua. Ciao!"
Code(314, sentence)
# "\"m%rfgmcew%c~~f%5(*-?%gcrcdqm%c%vcac%q|c6%\"mcw/"
Decode(314, "\"m%rfgmcew%c~~f%5(*-?%gcrcdqm%c%vcac%q|c6%\"mcw/")
# "Ci vediamo alle 17:30 davanti a casa tua. Ciao!"
sentence2 = "195/3 = 65"
Code(-10, sentence2)
# "F!I74TXTSI"
Decode(-10, "F!I74TXTSI")
# "195/3 = 65"
 
With the commands sample(N) and SAMPLE(N) you can reorder the first N positive natural
numbers or the first N letters (if N < 27)
set.seed(173); sample(9); set.seed(173); SAMPLE(9)
# 3 2 7 4 9 1 5 8 6
# "CBGDIAEHF"
 
                      ---------- ---------- ---------- ----------
 
# Technical NOTE on how to "save" the graphics and commands used.
 
# To store an image I can click on it and (from the menu that appears) save it as
# BitMap and then
# -- paste it directly into a document of the program you are writing with (OpenOffice,
# LibreOffice, Wordpad, …, Word, …) or
# -- paste it into Paint (or another graphical program), add what you want, move it and
# lower the margins,  and then either
- copy it into any document you are writing  or
- form Paint, save it in PNG or GIF format, which occupy little space.
 
# To save the used commands I can (from the File menu) run Save History that saves all
# executed commands in a text-file (I must give a name; I can leave the "Rhistory"
# extension or put the "txt" extension).  Then, afterwards, I can
# [ or  automatically load commands from the file using the Load History command,
#   which can also be used to automatically recall commands uploaded to other
#   files, or ]
# open the file with NotePad or another editor and copy and paste in R the sequence of
# commands I want.
 
# For a complete formulary of all the commands, see here.
 
                      ---------- ---------- ---------- ----------
 
# For more advanced uses (correlation, asimmetry, QQ plot, linear regression, variance,
# Gaussian, other distributions, Contingency Tables, ...) go HERE.
 
                 
 
# We just remember that beside the commands (present in the software) var(data) and
# sd(data), which calculate "experimental" variance and standard deviation (ie variance
# multiply by n/(n-1), if n is the number of data) and its square root, you can use:
 
# Var(data), Sd(data) and SdM(data)  that calculate the "theoretic"  variance
# and standard deviation (or root mean square) and the standard deviation of the mean.
# Instead of Sd, sqm (in Italian: "scarto quadratico medio") also is used.
# If Xi are the data and M is the mean of them:
 
#  Sd  is  √( ( (X1-M)^2 + … + (Xn-M)^2 ) / n )        = √ Var
 
#  Var is   ( (X1-M)^2 + … + (Xn-M)^2 ) / n
 
#  sd  is  √( ( (X1-M)^2 + … + (Xn-M)^2 ) / (n-1) )    = √ var
 
#  var is   ( (X1-M)^2 + … + (Xn-M)^2 ) / (n-1)
 
#  SdM is  √( ( (X1-M)^2 + … + (Xn-M)^2 ) / (n-1) / n )  =  sd / √n
# [or sdM]
 
# If I have the outcomes with their frequency I can compute the weighted mean and
# analogous value with: Wmean(data,frequencies) and Wmedian, WSd (or Wsqm), WVar:
 
d1=c(4,5,5,5,6,6);          mean(d1);   median(d1);   Sd(d1);   Var(d1)
#                          5.166667       5         0.6871843    0.4722222
d = c(4,5,6); f = c(1,3,2); Wmean(d,f); Wmedian(d,f); WSd(d,f); WVar(d,f)
#                          5.166667       5         0.6871843    0.4722222
 
# I can compute the number n of data:  n = sum(f)
 
# If I want compute "Median" (see here) I can use WMedian:
d1=c(1,1,1,5,6,6,6,6); d=c(1,2,3,5,6); f=c(1,1,1,1,4)
Median(d1); WMedian(d,f); median(d1); Wmedian(d,f)
#   5           5           5.5          5.5

 
Other examples of use