---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# [  source("http://macosa.dima.unige.it/r.R")  ]
# We refer to the introduction to R language here, here and here for the bases.
# We refer to WikiPedia (the English version only!) to go deep into Statistics

#                      ---------- ---------- ---------- ----------

# To have numerical indicators of asimmetry (skewness) of a data distribution  see here.

#                      ---------- ---------- ---------- ----------

# What does "to take randomly" mean? It depends on the law of distribution with which
# it is done. Two processes that generate a point falling into the circle with center O
# and radius 1 and test if its distance from O is less than 1/2.

BF=3;HF=3; BOXW(-1,1, -1,1)
circle(0,0, 1, "brown"); circle(0,0, 1/2, "brown"); BOX()
type(-1,1,"-1"); type(1,1,"1"); type(-1,-1,"-1"); type(1,-1,"1")
n = 0; ok = 0; M = 10000
for (i in 1:M) {x <- runif(1, -1,1); y <- runif(1, -1,1); d <- x*x+y*y
           if(d < 1){n <- n+1; Dot(x,y,"brown"); if(d < 0.25) ok <- ok+1}}
cat("n.throws ",n," %OK ",ok/n*100,"\n")
#
BF=3;HF=3; BOXW(-1,1, -1,1)
type(-1,1,"-1"); type(1,1,"1"); type(-1,-1,"-1"); type(1,-1,"1")
circle(0,0, 1, "brown"); circle(0,0, 1/2, "brown"); BOX()
ok = 0; n = 10000
for(i in 1:n) {r <- runif(1); a <- runif(1, 0,pi*2); x <- r*cos(a); y <- r*sin(a)
               d <- x*x+y*y; Dot(x,y,"brown"); if(d < 0.25) ok <- ok+1}
cat("n.throws ",n," %OK ",ok/n*100,"\n")

          
n.throws  7851  %OK  25.11782  (-> 1/4)    n.throws  10000  %OK  49.7  (-> 1/2)
# The three-dimensional representation (how to get it):
  
#                      ---------- ---------- ---------- ----------

# Here we saw how to use arrays. We go deep into the topic.
z = c(1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4)
t = array(z,dim=c(6,4)); t
     [,1] [,2] [,3] [,4]
[1,]    1    7    3    9
[2,]    2    8    4    0
[3,]    3    9    5    1
[4,]    4    0    6    2
[5,]    5    1    7    3
[6,]    6    2    8    4
# To see the table in a different way:
data.entry(t)
              
# We can also use data.entry to insert data:
t = array(dim=c(6,4)); data.entry(t)
              
# I insert data into the cells. Then I click [X] and:
t
      var1 var2 var3 var4
[1,]    1    7    3    9
[2,]    2    8    4    0
[3,]    3    9    5    1
[4,]    4    0    6    2
[5,]    5    1    7    3
[6,]    6    2    8    4
length(t); colMeans(t);rowMeans(t)
[1] 24
[1] 3.500000 4.500000 5.500000 3.166667
[1] 5.0 3.5 4.5 3.0 4.0 5.0
#
# An analysis of the columns
summary(t)
       V1             V2             V3             V4       
 Min.   :1.00   Min.   :0.00   Min.   :3.00   Min.   :0.000  
 1st Qu.:2.25   1st Qu.:1.25   1st Qu.:4.25   1st Qu.:1.250  
 Median :3.50   Median :4.50   Median :5.50   Median :2.500  
 Mean   :3.50   Mean   :4.50   Mean   :5.50   Mean   :3.167  
 3rd Qu.:4.75   3rd Qu.:7.75   3rd Qu.:6.75   3rd Qu.:3.750  
 Max.   :6.00   Max.   :9.00   Max.   :8.00   Max.   :9.000
# how to analyze the data in the first column
a = t[,1]; a
[1] 1 2 3 4 5 6
summary(a)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00    2.25    3.50    3.50    4.75    6.00
# and in the 6th row
b = t[6,]; b; summary(b)
[1] 6 2 8 4
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    2.0     3.5     5.0     5.0     6.5     8.0

#                      ---------- ---------- ---------- ----------

# See here for the histograms 3D (hist3d) and for the mosaic graphs (and for table).
 
      [,1] [,2]               males females
 [1,]  603  246     primary    2.63    1.07          The distribution of 2nd col
 [2,] 5051 1311     secondary 22.06    5.73          primary secondary  tertiary
 [3,] 7787 7901     tertiary  34.01   34.50            2.6      13.9      83.5
 
             

#                      ---------- ---------- ---------- ----------

# The program can analyze almost any stored data. We see an example where we use the
# readLines command to analyze the files and the read.table command to load them:
# [you can run help(read.table) for details;  with n=… I avoid the possibility of
#  filling the screen with many data;  see here to use scan to load data not
#  presented in a table]
readLines("http://macosa.dima.unige.it/R/pulse.txt",n=11)
 [1] "# Survey of students in a university course, from the MiniTab manual" 
 [2] "# Heartbeats before an eventual 1 minute run"                         
 [3] "# Heartbeats after"                                                   
 [4] "# Did the student run? (1 yes; 0 no; according to the cast of a coin)"
 [5] "# Is the student a smoker? (1 yes; 0 no)"                             
 [6] "# the sex (1 M; 2 F)"                                                 
 [7] "# height"                                                             
 [8] "# weight"                                                             
 [9] "# physical activity (0 nothing; 1 little; 2 middling; 3 a lot)"       
[10] "Num,BeatBef,BeatAft,Run,Smoke,Sex,Heig,Weig,Phis"                     
[11] "01,64,88,1,0,1,168,64,2"
# In read.table I put header=FALSE to indicate that there is no column header that
# contains the names of the variables, skip=10 to indicate how many lines to skip
# before beginning to read data, sep="," to indicate the character that separes the
# data:
da= read.table("http://macosa.dima.unige.it/R/pulse.txt",header=FALSE,skip=10,sep =",")
da
     V1  V2  V3  V4 V5 V6 V7 V8 V9
1    1   64  88  1  0  1 168 64  2
2    2   58  70  1  0  1 183 66  2
3    3   62  76  1  1  1 186 73  3
4    4   66  78  1  1  1 184 86  1
5    5   64  80  1  0  1 176 70  2
6    6   74  84  1  0  1 184 75  1
7    7   84  84  1  0  1 184 68  3
...
89   89  78  80  0  0  2 173 60  1
90   90  68  68  0  0  2 159 50  2
91   91  86  84  0  0  2 171 68  3
92   92  76  76  0  0  2 156 49  2
# To understand what we are analyzing we use str:
str(da)
  'data.frame':   92 obs. of  8 variables:
  $ V1: int  1 2 3 4 5 6 7 8 9 10 ...
  $ V2: int  64 58 62 66 64 74 84 68 62 76 ...
  $ V3: int  88 70 76 78 80 84 84 72 75 118 ...
  $ V4: int  1 1 1 1 1 1 1 1 1 1 ...
  $ V5: int  0 0 1 1 0 0 0 0 0 0 ...
  $ V6: int  1 1 1 1 1 1 1 1 1 1 ...
  $ V7: int  168 183 186 184 176 184 184 188 184 181 ...
  $ V8: int  64 66 73 86 70 75 68 86 88 63 ...
  $ V9: int  2 2 3 1 2 1 3 2 2 2 ...
# The columns, without headers, are automatically headed (V1, … , V9)
summary(da[2:9])
       V2               V3            V4               V5        
 Min.   : 48.00   Min.   : 50   Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 64.00   1st Qu.: 68   1st Qu.:0.0000   1st Qu.:0.0000  
 Median : 71.00   Median : 76   Median :0.0000   Median :0.0000  
 Mean   : 72.87   Mean   : 80   Mean   :0.3804   Mean   :0.3043  
 3rd Qu.: 80.00   3rd Qu.: 85   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :100.00   Max.   :140   Max.   :1.0000   Max.   :1.0000  
       V6              V7              V8              V9       
 Min.   :1.000   Min.   :154.0   Min.   :43.00   Min.   :0.000  
 1st Qu.:1.000   1st Qu.:167.8   1st Qu.:57.00   1st Qu.:2.000  
 Median :1.000   Median :175.0   Median :66.00   Median :2.000  
 Mean   :1.380   Mean   :174.4   Mean   :65.84   Mean   :2.109  
 3rd Qu.:2.000   3rd Qu.:183.0   3rd Qu.:70.25   3rd Qu.:2.000  
 Max.   :2.000   Max.   :190.0   Max.   :97.00   Max.   :3.000
# I can examine a single variable (the heartbeats at rest):
bb = da$V2; summary(bb)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  48.00   64.00   71.00   72.87   80.00  100.00
stem(bb)
  The decimal point is 1 digit(s) to the right of the |
   4 | 8
   5 | 44888
   6 | 0000122222222244446666688888888888
   7 | 000000222222444446666688888
   8 | 000222444467888
   9 | 000022466
  10 | 0
#
# If I want, I can use headers (before the data). In this case they already are: Num, …
dat= read.table("http://macosa.dima.unige.it/R/pulse.txt",header=TRUE,skip=9,sep =",")
str(dat)
'data.frame':   92 obs. of  9 variables:
 $ Num    : int  1 2 3 4 5 6 7 8 9 10 ...
 $ BeatBef: int  64 58 62 66 64 74 84 68 62 76 ...
 $ BeatAft: int  88 70 76 78 80 84 84 72 75 118 ...
 $ Run    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Smoke  : int  0 0 1 1 0 0 0 0 0 0 ...
 $ Sex    : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Heig   : int  168 183 186 184 176 184 184 188 184 181 ...
 $ Weig   : int  64 66 73 86 70 75 68 86 88 63 ...
 $ Phis   : int  2 2 3 1 2 1 3 2 2 2 ...
summary(dat$BeatBef)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  48.00   64.00   71.00   72.87   80.00  100.00 
#
# I can extract the data (of heartbeats at rest) of males or females:
hm = subset(bb,da$V6==1); hf = subset(bb,da$V6==2)
# Their histograms in the same scale:
dev.new(); hist(hm,right=FALSE,probability=TRUE,border="blue",col="cyan",
                 xlim=c(45,100),ylim=c(0,0.055)); BOX()   # With BOX I add a grid
dev.new(); hist(hf,right=FALSE,probability=TRUE,border="brown",col="pink",
                 xlim=c(45,100),ylim=c(0,0.055)); BOX()

#
# How to determine the coefficient of correlation rX,Y between two variables X and Y?

             
# Some examples (rounding to 1st digit after "."):
    
# We can explicitly define:
COR = function(X,Y) mean((X-mean(X))*(Y-mean(Y)))/(Sd(X)*Sd(Y))
                        [ see here ]
# Example of use:
h = da$V7; w = da$V8; COR(h,w) # correlation between heigth and weight
[1] 0.7826331
s = da$V5; p = da$V9; COR(s,p) # correlation between smoking and physical activity
[1] -0.1202018
# Or we can use the function cor. For example we can type cor(da)  or:
print( cor(da[2:9]), 3)
        V2      V3       V4      V5     V6      V7       V8       V9
V2  1.0000  0.6162  0.05228  0.1287  0.285 -0.2110 -0.20322 -0.06257
V3  0.6162  1.0000  0.57682  0.0459  0.309 -0.1533 -0.16637 -0.14110
V4  0.0523  0.5768  1.00000  0.0656 -0.107  0.2236  0.22402  0.00732
V5  0.1287  0.0459  0.06558  1.0000 -0.129  0.0428  0.20062 -0.12020
V6  0.2854  0.3095 -0.10677 -0.1290  1.000 -0.709  -0.71023 -0.10497
V7 -0.2110 -0.1533  0.22360  0.0428 -0.709  1.0000  0.78263  0.08930
V8 -0.2032 -0.1664  0.22402  0.2006 -0.710  0.7826  1.00000 -0.00404
V9 -0.0626 -0.1411  0.00732 -0.1202 -0.105  0.0893 -0.00404  1.00000
# or:
round( cor(da[2:9]), 3)
       V2     V3     V4     V5     V6     V7     V8     V9
V2  1.000  0.616  0.052  0.129  0.285 -0.211 -0.203 -0.063
V3  0.616  1.000  0.577  0.046  0.309 -0.153 -0.166 -0.141
V4  0.052  0.577  1.000  0.066 -0.107  0.224  0.224  0.007
V5  0.129  0.046  0.066  1.000 -0.129  0.043  0.201 -0.120
V6  0.285  0.309 -0.107 -0.129  1.000 -0.709 -0.710 -0.105
V7 -0.211 -0.153  0.224  0.043 -0.709  1.000  0.783  0.089
V8 -0.203 -0.166  0.224  0.201 -0.710  0.783  1.000 -0.004
V9 -0.063 -0.141  0.007 -0.120 -0.105  0.089 -0.004  1.000
#
# The statistical correlation is not a substantial correlation!
# For example, let's examine separately the statistical correlation between weight and
# height for males and females:
M = subset(da,da$V6==1); F = subset(da,da$V6==2)
cor(M[7],M[8]); cor(F[7],F[8])
          V8               V8
V7 0.5904648     V7 0.5191614
# They are both much less than 0.783!
# We can better understand the phenomenon with a graphic representation:
Plane(min(da[7]),max(da[7]),min(da[8]),max(da[8])); abovex("height"); abovey("weight")
POINT(F[,7],F[,8],"red"); Point(M[,7],M[,8],"blue")

                  
# The problem is due to the presence of two subpopulations with different
# characteristics.
# How to associate a precision with cor? See down.

#                      ---------- ---------- ---------- ----------

# To study the relationship between a dependent variable y and an independent variable
# x we can use linear regression, ie (if (x1,y2),…,(xN,yN) are the experimental points)
# we can find the function F: x->ax+b such that (F(x1)-y1)²+…+(F(xN)-yN)² is minimum
# For instance we examine the same data (see).
H = da[,7]; W = da[,8]
Plane(min(H),max(H),min(W),max(W)); abovex("height"); abovey("weight")
Point(H,W,"brown")
LR(H,W)                     # You obtain A and B, if y=A+Bx is the line
[1] -90.8681981   0.8983596
f = function(x) LR(H,W)[1]+LR(H,W)[2]*x; graph1(f,150,200,"blue")
# or:    abline(LR(H,W), col="blue")
# The line passes through the centroid (see the picture in the center)
POINT(mean(H),mean(W),"blue")



# We can consider the linear regression constrained to pass through (0,0) or through
# another point: it is the same problem with some constrains to F. The picture on the
# right illustrates two cases.
# Also for this linear regression the line passes through the centroid.
Plane(0,max(H),0,max(W)); abovex("height"); abovey("weight")
Point(H,W,"brown"); graph1(f,0,200,"blue"); POINT(mean(H),mean(W),"blue")
POINT(0,0,"black"); LR0(H,W,0,0)        # You obtain A and B, if y=A+Bx is the line
0   0.3789037  <- LR01  LR02   y=LR01+LR02*x
# [ We note that I can obtain 0.3789037 also by:  mean(H*W)/mean(H^2) ]
A=LR01; B=LR02; g = function(x) A + B*x; graph1(g,0,200,"seagreen")
POINT(50,40,"black"); LR0(H,W,50,40)
29.42683   0.2114634  <- LR01  LR02   y=LR01+LR02*x
h = function(x) LR01+LR02*x; graph1(h,0,200,"red")
# How to associate a precision with LR and LR0? See down.
# We can compute linear regression with  regression and regression1 too (see).

#                      ---------- ---------- ---------- ----------

# A more significant example.
readLines("http://macosa.dima.unige.it/R/callup.txt",n=11)
 [1] "# Medical examination for call-up (Italian Navy - 1987 - first contingent)"
 [2] "#   (some chest measurements are unreliable; pay attention to that)"       
 [3] "# chest (truncated to cm)"
 [4] "# weight (truncated to kg)"
 [5] "# region (numbered on the basis of latitude of the chief town: 0ao 1tr 2fr"
 [6] "#    3lomb 4ven 5piem 6em-rom 7lig 8tosc 9mar 10um 11abr 12laz 13pugl"
 [7] "#    14camp 15sard 16cal 17sic; the other regions are not present)"
 [8] "# height (truncated to cm and decreased by 100: 178 cm -> 78)"
 [9] "chest; weight; region; height" 
[10] "91;62;15;73" 
[11] "86;64;15;70"
da <- read.table("http://macosa.dima.unige.it/R/callup.txt",skip=8, sep=";",header=TRUE)
str(da)
'data.frame':   4170 obs. of  4 variables:
 $ chest : int  91 86 89 88 92 95 91 83 87 88 ...
 $ weight: int  62 64 64 68 70 77 65 62 55 60 ...
 $ region: int  15 15 15 15 15 15 15 15 15 15 ...
 $ height: int  73 70 74 79 66 69 66 79 75 65 ...
# If I want to visualize the data in a different way:
# data.entry(da)
                 
summary(da)
     chest            weight           region          height     
 Min.   : 48.00   Min.   : 48.00   Min.   : 0.00   Min.   :47.00  
 1st Qu.: 86.00   1st Qu.: 63.00   1st Qu.: 7.00   1st Qu.:70.00  
 Median : 90.00   Median : 70.00   Median :12.00   Median :74.00  
 Mean   : 91.19   Mean   : 70.48   Mean   :10.18   Mean   :74.42  
 3rd Qu.: 95.00   3rd Qu.: 76.00   3rd Qu.:14.00   3rd Qu.:79.00  
 Max.   :179.00   Max.   :126.00   Max.   :17.00   Max.   :96.00
# We add 1/2 to truncated data, so we do not have to add 1/2 to the average values
# we get
da$chest <- da$chest+1/2; da$weight <- da$weight+1/2; da$height <- da$height+1/2
summary(da)
     chest            weight           region          height     
 Min.   : 48.50   Min.   : 48.50   Min.   : 0.00   Min.   :47.50  
 1st Qu.: 86.50   1st Qu.: 63.50   1st Qu.: 7.00   1st Qu.:70.50  
 Median : 90.50   Median : 70.50   Median :12.00   Median :74.50  
 Mean   : 91.69   Mean   : 70.98   Mean   :10.18   Mean   :74.92  
 3rd Qu.: 95.50   3rd Qu.: 76.50   3rd Qu.:14.00   3rd Qu.:79.50  
 Max.   :179.50   Max.   :126.50   Max.   :17.00   Max.   :96.50
round( cor(da), 3)
       chest weight region height
chest  1.000  0.697  0.016  0.255
weight 0.697  1.000  0.023  0.458
region 0.016  0.023  1.000 -0.152
height 0.255  0.458 -0.152  1.000
#
# Let's shorten the names, to study the regression:
C = da$chest; W = da$weight; R = da$region; H = da$height
Plane(min(C)-5,max(C)+5, min(W)-5,max(W)+5); Point(C,W,"magenta")
LR(C,W)
[1] -15.300034   0.940987
f = function(x) LR(C,W)[1]+LR(C,W)[2]*x; graph1(f,50,200,"black")
# Let's study the inverse relationship, too:
LR(W,C)
[1] 54.9919381  0.5170087
g = function(y) LR(W,C)[1]+LR(W,C)[2]*y
# We must represent the inverse relation (x <-> y).
# We represent it as CURVE instead of function
G = function(x,y) x - g(y); CUR(G,"seagreen")
POINT(mean(C),mean(W),"blue")

 

# If we have a large numbers of points we can use more or less dense colors according
# to the frequency. We have two chances.  The first (picture in the middle):
Plane(min(C)-5,max(C)+5, min(W)-5,max(W)+5)
Point(C,W, Dcol(C,W) )
graph1(f,50,200,"black"); CUR(G,"brown"); POINT(mean(C),mean(W),"red")
# The second (picture on the right)
Plane(min(C)-5,max(C)+5, min(W)-5,max(W)+5)
Point(C,W, DCOL(C,W) )
graph1(f,50,200,"black"); CUR(G,"brown")
#
# Let's see the histograms of height and weight, too:
dev.new(); hist(H,right=FALSE,probability=TRUE,col="grey");BOX()
dev.new(); hist(W,right=FALSE,probability=TRUE,col="grey");BOX()



# The histogram of chest is hampered by not reliable data.
# Well, I cut off the tails:
dev.new(); hist(C,right=FALSE,probability=TRUE,col="grey");BOX()
C2=0; for(i in 1:4140) C2[i]=C1[i+15]
dev.new(); hist(C2,nclass=15,right=FALSE,probability=TRUE,col="grey");BOX()


#                      ---------- ---------- ---------- ----------

# QQ plot (quantiles against quantiles plot) is an instrument sometimes employed to
# compare distributions.
# An example of use (referring to the previuos data).
HP=0; for(i in 1:19) HP[i] = Percentile(H,i*5)
WP=0; for(i in 1:19) WP[i] = Percentile(W,i*5)
CP=0; for(i in 1:19) CP[i] = Percentile(C,i*5)
# Some percentiles of the variables and their graphic comparison:
PlaneP(HP,WP); Point(HP,WP,"brown"); abovex("H");abovey("W")
PlaneP(HP,CP); Point(HP,CP,"brown"); abovex("H");abovey("C")
PlaneP(WP,CP); Point(WP,CP,"brown"); abovex("W");abovey("C")

  

# As we have seen (better) comparing the histograms, I find that W and C have a
# similar distribution.

#                      ---------- ---------- ---------- ----------

# Still something about the study of the relation between two variables.


#  I hang different
#  objects to an elastic.
#  Let F be the weight (in   
#  g) of the objects, let
#  H be the elongation
#  (in mm) of the elastic.
  H |  F
 15 | 260
 20 | 380
 39 | 710
 52 | 990   
   
# If I know that the values of H are ±2 and the values of F are ±5 I can proceed as we # saw here: H = c(15,20,39,52); F = c(260,380,710,990); eH = rep(2,4); eF = rep(5,4) Plane(0,60, 0,1000); pointDif(0,0, H,F, eH,eF) # 18.24074 * x # 19.32432 * x # F = k·H, 18.240 ≤ k ≤ 19.325, or k = 18.8±0.6 # # If I don't know the precision of the measurements I can proceed as we saw just above: Plane(0,60, 0,1000); POINT(H,F, "brown"); abovex("H"); abovey("F") LR(H,F) # if I don't know that for H=0 F=0 # -21.40182 19.25085 f = function(x) LR(H,F)[1]+LR(H,F)[2]*x; graph1(f,0,60,"red") LR0(H,F, 0,0) # if I know that for H=0 F=0 # 0 18.69485 <- LR01 LR02 y=LR01+LR02*x g = function(x) LR02*x; graph1(g,0,60,"seagreen") # To evaluate the goodness of approximation obtained with the regression line, I # proceed as follows, in the first case (LR): (F function of H; first F, then H) confInt(F,H, 90/100) # 5 % 95 % # (Intercept) -90.15108 47.34745 # Conf 17.27649 21.22522 # In the second case (LR0): confInt0(F,H, 0,0, 90/100) # 5 % 95 % # Conf 18.03658 19.35311 (quite similar to 18.240 19.325) # These are the confidence intervals at the 90% (0.9) significance level of the slope # and (in the first case) of the intercept of the line. I can change the level. # H and F are strongly correlated (see above): cor(H,F) # 0.9987686 (about 1) # To value the efficacy of the correlation we determine a confidence interval cor.test(H,F, conf.level=0.90) # 90 percent confidence interval: # 0.9674734 0.9999541 (comments on the other outputs) # Another correlation coefficient often used is "Sperman's". It is indicated with ρ (the # Greek letter "ro", which corresponds to our R) in that it compares the "ranks" of the # data, ie not their values but their order number (1st, 2nd, ...). See here. An example: x = c(10,20,30,40); y = c(15,60,70,200) cor(x,y); cor(x,y, method="spearman") # 0.9173057 1 # The data is arranged in an increasing way, but not aligned. The usual correlation # coefficient is close to 1. The Spearman coefficient is instead just 1, indicating that # the data is increasing, beyond the way they grow. Another example: x = c(10,20,30,40,50); y = c(1,2,-900,3,4) cor(x,y); cor(x,y, method="spearman") # 0.002742232 0.7 # ---------- ---------- ---------- ---------- # For linear, quadratic and cubic regression and for spline see here # ---------- ---------- ---------- ---------- # The coefficient of correlation can be found also for other type of function. An example. # We can define: COR2 = function(X,Y,f) {Sr=sum((y-f(x))^2);Sm=sum((Y-mean(Y))^2);sqrt((Sm-Sr)/Sm)} # Or we can use the already defined function cor2 # We want compare the following data with the quadratic regression x <- c(0,1,2,3,4,5); y <- c(2.1,7.7,13.6,27.2,40.9,61.1) Plane(-1,6, 0,70); POINT(x,y, "blue") regression2(x,y) # 1.86 * x^2 + 2.36 * x + 2.48 f=function(x) 1.86 * x^2 + 2.36 * x + 2.48; graph1(f,-1,6, "black") cor2(x,y,f) # 0.9992544 A value near to 1! With linear regression I have: cor(x,y) # 0.9731813 # I can obtain the same value with cor2 if I put this function: regression1(x,y) # 11.66 * x + -3.724 g = function(x) 11.66 * x + -3.724; graph1(g,-1,6, "red") cor2(x,y,g) # 0.9731812 # But math is not enough! In the case of the picture on the right: x <- c(0,1,2,3,4,5); y <- c(2,19,38,15,48,43) Plane(-1,6, 0,50); POINT(x,y,"black") regression1(x,y) # 7.686 * x + 8.286 f1=function(x) 7.686 * x + 8.286; graph1(f1,-1,6, "blue") regression2(x,y) # -0.964 * x^2 + 12.5 * x + 5.07 f2=function(x) -0.964 * x^2 + 12.5 * x + 5.07; graph1(f2,-1,6, "brown") regression3(x,y) # 0.87 * x^3 + -7.49 * x^2 + 24.4 * x + 2.46 f3=function(x) 0.87*x^3-7.49*x^2+24.4*x+2.46; graph1(f3,-1,6, "red") cor2(x,y,f1); cor2(x,y,f2); cor2(x,y,f3) # 0.7916388 0.8048205 0.8230846 # The cubic function gives a better correlation. But the type of function (a polynomial # or other function) with which I can approximate the data depends on the context.
# For example, if I look for a function of the type
# x -> sin(U*x)+V*x  I find  x -> sin(3.58*x)+9.98*x
# (see here). I obtain a different correlation:
f=function(x) sin(3.58*x)+9.98*x; cor2(x,y,f)
# 0.768965
# If I look for a function of the type x -> U*x I find:
k=function(x) 9.94545*x; cor2(x,y, k)
# 0.7397531
            
 
     
# ---------- ---------- ---------- ---------- # Binomial and Gaussian distributions, Continuous case, Central limit theorem # Exponential and Poisson distributions # Test Other examples of use