---------- ---------- ---------- ---------- ---------- ---------- ---------- ----------
# [ source("http://macosa.dima.unige.it/r.R") ]
# We refer to the introduction to R language here, here and here for the bases.
# We refer to WikiPedia (the English version only!) to go deep into Statistics
# ---------- ---------- ---------- ----------
# To have numerical indicators of asymmetry (skewness) of a data distribution see here.
# ---------- ---------- ---------- ----------
# What does "to take randomly" mean? It depends on the law of distribution with which
# it is done. Two processes that generate a point falling into the circle with center O
# and radius 1 and test if its distance from O is less than 1/2.
# Process 1: draw x and y uniformly in [-1,1] and keep the point only if it
# falls inside the unit circle (rejection sampling -> uniform over the disc,
# so the fraction with distance < 1/2 tends to area ratio (1/2)^2 = 1/4).
# BF, HF, BOXW, circle, type, Dot, BOX come from the sourced macosa r.R library.
BF=3;HF=3; BOXW(-1,1, -1,1)
circle(0,0, 1, "brown"); circle(0,0, 1/2, "brown"); BOX()
type(-1,1,"-1"); type(1,1,"1"); type(-1,-1,"-1"); type(1,-1,"1")
# n counts accepted throws (inside the circle), ok those within radius 1/2
n = 0; ok = 0; M = 10000
for (i in 1:M) {x <- runif(1, -1,1); y <- runif(1, -1,1); d <- x*x+y*y
if(d < 1){n <- n+1; Dot(x,y,"brown"); if(d < 0.25) ok <- ok+1}}
cat("n.throws ",n," %OK ",ok/n*100,"\n")
#
# Process 2: draw a radius r and an angle a uniformly and convert to (x,y).
# This is NOT uniform over the disc (points pile up near the center):
# P(distance < 1/2) = P(r < 1/2) -> 1/2, not 1/4 as in process 1.
BF=3;HF=3; BOXW(-1,1, -1,1)
type(-1,1,"-1"); type(1,1,"1"); type(-1,-1,"-1"); type(1,-1,"1")
circle(0,0, 1, "brown"); circle(0,0, 1/2, "brown"); BOX()
ok = 0; n = 10000
for(i in 1:n) {r <- runif(1); a <- runif(1, 0,pi*2); x <- r*cos(a); y <- r*sin(a)
d <- x*x+y*y; Dot(x,y,"brown"); if(d < 0.25) ok <- ok+1}
cat("n.throws ",n," %OK ",ok/n*100,"\n")
n.throws 7851 %OK 25.11782 (-> 1/4) n.throws 10000 %OK 49.7 (-> 1/2)
# The three-dimensional representation (how to get it):
# ---------- ---------- ---------- ----------
# Here we saw how to use arrays. We go deep into the topic.
# Fill a 6x4 matrix from the 24-element vector z, column by column
# (R's default column-major order).
# NOTE(review): the name t shadows the base transpose function t();
# harmless here, but risky in longer sessions.
z = c(1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4)
t = array(z,dim=c(6,4)); t
[,1] [,2] [,3] [,4]
[1,] 1 7 3 9
[2,] 2 8 4 0
[3,] 3 9 5 1
[4,] 4 0 6 2
[5,] 5 1 7 3
[6,] 6 2 8 4
# To see the table in a different way:
data.entry(t)
# We can also use data.entry to insert data:
t = array(dim=c(6,4)); data.entry(t)
# I insert data into the cells. Then I click [X] and:
t
var1 var2 var3 var4
[1,] 1 7 3 9
[2,] 2 8 4 0
[3,] 3 9 5 1
[4,] 4 0 6 2
[5,] 5 1 7 3
[6,] 6 2 8 4
length(t); colMeans(t);rowMeans(t)
[1] 24
[1] 3.500000 4.500000 5.500000 3.166667
[1] 5.0 3.5 4.5 3.0 4.0 5.0
#
# An analysis of the columns
summary(t)
V1 V2 V3 V4
Min. :1.00 Min. :0.00 Min. :3.00 Min. :0.000
1st Qu.:2.25 1st Qu.:1.25 1st Qu.:4.25 1st Qu.:1.250
Median :3.50 Median :4.50 Median :5.50 Median :2.500
Mean :3.50 Mean :4.50 Mean :5.50 Mean :3.167
3rd Qu.:4.75 3rd Qu.:7.75 3rd Qu.:6.75 3rd Qu.:3.750
Max. :6.00 Max. :9.00 Max. :8.00 Max. :9.000
# how to analyze the data in the first column
a = t[,1]; a
[1] 1 2 3 4 5 6
summary(a)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 2.25 3.50 3.50 4.75 6.00
# and in the 6th row
b = t[6,]; b; summary(b)
[1] 6 2 8 4
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.0 3.5 5.0 5.0 6.5 8.0
# ---------- ---------- ---------- ----------
# See here for the histograms 3D (hist3d) and for the mosaic graphs (and for table).
[,1] [,2] males females
[1,] 603 246 primary 2.63 1.07 The distribution of 2nd col
[2,] 5051 1311 secondary 22.06 5.73 primary secondary tertiary
[3,] 7787 7901 tertiary 34.01 34.50 2.6 13.9 83.5
# ---------- ---------- ---------- ----------
# The program can analyze almost any stored data. We see an example where we use the
# readLines command to analyze the files and the read.table command to load them:
# [you can run help(read.table) for details; with n=...
# I avoid the possibility of
# filling the screen with many data; see here to use scan to load data not
# presented in a table]
readLines("http://macosa.dima.unige.it/R/pulse.txt",n=11)
[1] "# Survey of students in a university course, from the MiniTab manual"
[2] "# Heartbeats before an eventual 1 minute run"
[3] "# Heartbeats after"
[4] "# Did the student run? (1 yes; 0 no; according to the cast of a coin)"
[5] "# Is the student a smoker? (1 yes; 0 no)"
[6] "# the sex (1 M; 2 F)"
[7] "# height"
[8] "# weight"
[9] "# physical activity (0 nothing; 1 little; 2 middling; 3 a lot)"
[10] "Num,BeatBef,BeatAft,Run,Smoke,Sex,Heig,Weig,Phis"
[11] "01,64,88,1,0,1,168,64,2"
# In read.table I put header=FALSE to indicate that there is no column header that
# contains the names of the variables, skip=10 to indicate how many lines to skip
# before beginning to read data, sep="," to indicate the character that separates the
# data:
da= read.table("http://macosa.dima.unige.it/R/pulse.txt",header=FALSE,skip=10,sep =",")
da
V1 V2 V3 V4 V5 V6 V7 V8 V9
1 1 64 88 1 0 1 168 64 2
2 2 58 70 1 0 1 183 66 2
3 3 62 76 1 1 1 186 73 3
4 4 66 78 1 1 1 184 86 1
5 5 64 80 1 0 1 176 70 2
6 6 74 84 1 0 1 184 75 1
7 7 84 84 1 0 1 184 68 3
...
89 89 78 80 0 0 2 173 60 1
90 90 68 68 0 0 2 159 50 2
91 91 86 84 0 0 2 171 68 3
92 92 76 76 0 0 2 156 49 2
# To understand what we are analyzing we use str:
str(da)
'data.frame': 92 obs. of 9 variables:
$ V1: int 1 2 3 4 5 6 7 8 9 10 ...
$ V2: int 64 58 62 66 64 74 84 68 62 76 ...
$ V3: int 88 70 76 78 80 84 84 72 75 118 ...
$ V4: int 1 1 1 1 1 1 1 1 1 1 ...
$ V5: int 0 0 1 1 0 0 0 0 0 0 ...
$ V6: int 1 1 1 1 1 1 1 1 1 1 ...
$ V7: int 168 183 186 184 176 184 184 188 184 181 ...
$ V8: int 64 66 73 86 70 75 68 86 88 63 ...
$ V9: int 2 2 3 1 2 1 3 2 2 2 ...
# The columns, without headers, are automatically headed (V1, ..., V9)
summary(da[2:9])
V2 V3 V4 V5
Min. : 48.00 Min. : 50 Min. :0.0000 Min. :0.0000
1st Qu.: 64.00 1st Qu.: 68 1st Qu.:0.0000 1st Qu.:0.0000
Median : 71.00 Median : 76 Median :0.0000 Median :0.0000
Mean : 72.87 Mean : 80 Mean :0.3804 Mean :0.3043
3rd Qu.: 80.00 3rd Qu.: 85 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :100.00 Max. :140 Max. :1.0000 Max. :1.0000
V6 V7 V8 V9
Min. :1.000 Min. :154.0 Min. :43.00 Min. :0.000
1st Qu.:1.000 1st Qu.:167.8 1st Qu.:57.00 1st Qu.:2.000
Median :1.000 Median :175.0 Median :66.00 Median :2.000
Mean :1.380 Mean :174.4 Mean :65.84 Mean :2.109
3rd Qu.:2.000 3rd Qu.:183.0 3rd Qu.:70.25 3rd Qu.:2.000
Max. :2.000 Max. :190.0 Max. :97.00 Max. :3.000
# I can examine a single variable (the heartbeats at rest):
bb = da$V2; summary(bb)
Min. 1st Qu. Median Mean 3rd Qu. Max.
48.00 64.00 71.00 72.87 80.00 100.00
stem(bb)
The decimal point is 1 digit(s) to the right of the |
4 | 8
5 | 44888
6 | 0000122222222244446666688888888888
7 | 000000222222444446666688888
8 | 000222444467888
9 | 000022466
10 | 0
#
# If I want, I can use headers (before the data). In this case they already are: Num,
dat= read.table("http://macosa.dima.unige.it/R/pulse.txt",header=TRUE,skip=9,sep =",")
str(dat)
'data.frame': 92 obs. of 9 variables:
$ Num : int 1 2 3 4 5 6 7 8 9 10 ...
$ BeatBef: int 64 58 62 66 64 74 84 68 62 76 ...
$ BeatAft: int 88 70 76 78 80 84 84 72 75 118 ...
$ Run : int 1 1 1 1 1 1 1 1 1 1 ...
$ Smoke : int 0 0 1 1 0 0 0 0 0 0 ...
$ Sex : int 1 1 1 1 1 1 1 1 1 1 ...
$ Heig : int 168 183 186 184 176 184 184 188 184 181 ...
$ Weig : int 64 66 73 86 70 75 68 86 88 63 ...
$ Phis : int 2 2 3 1 2 1 3 2 2 2 ...
summary(dat$BeatBef)
Min. 1st Qu. Median Mean 3rd Qu. Max.
48.00 64.00 71.00 72.87 80.00 100.00
#
# I can extract the data (of heartbeats at rest) of males or females:
# Extract resting heartbeats by sex (V6: 1 = male, 2 = female).
hm = subset(bb,da$V6==1); hf = subset(bb,da$V6==2)
# Their histograms in the same scale:
# (identical xlim/ylim in both windows so the two plots are comparable)
dev.new(); hist(hm,right=FALSE,probability=TRUE,border="blue",col="cyan",
xlim=c(45,100),ylim=c(0,0.055)); BOX() # With BOX I add a grid
dev.new(); hist(hf,right=FALSE,probability=TRUE,border="brown",col="pink",
xlim=c(45,100),ylim=c(0,0.055)); BOX()
#
# How to determine the coefficient of correlation rX,Y between two variables X and Y?
# Some examples (rounding to 1st digit after "."):
# We can explicitly define:
COR = function(X,Y) mean((X-mean(X))*(Y-mean(Y)))/(Sd(X)*Sd(Y))
[ see here ]
# Example of use:
h = da$V7; w = da$V8; COR(h,w) # correlation between height and weight
[1] 0.7826331
s = da$V5; p = da$V9; COR(s,p) # correlation between smoking and physical activity
[1] -0.1202018
# Or we can use the function cor. For example we can type cor(da) or:
print( cor(da[2:9]), 3)
V2 V3 V4 V5 V6 V7 V8 V9
V2 1.0000 0.6162 0.05228 0.1287 0.285 -0.2110 -0.20322 -0.06257
V3 0.6162 1.0000 0.57682 0.0459 0.309 -0.1533 -0.16637 -0.14110
V4 0.0523 0.5768 1.00000 0.0656 -0.107 0.2236 0.22402 0.00732
V5 0.1287 0.0459 0.06558 1.0000 -0.129 0.0428 0.20062 -0.12020
V6 0.2854 0.3095 -0.10677 -0.1290 1.000 -0.709 -0.71023 -0.10497
V7 -0.2110 -0.1533 0.22360 0.0428 -0.709 1.0000 0.78263 0.08930
V8 -0.2032 -0.1664 0.22402 0.2006 -0.710 0.7826 1.00000 -0.00404
V9 -0.0626 -0.1411 0.00732 -0.1202 -0.105 0.0893 -0.00404 1.00000
# or:
round( cor(da[2:9]), 3)
V2 V3 V4 V5 V6 V7 V8 V9
V2 1.000 0.616 0.052 0.129 0.285 -0.211 -0.203 -0.063
V3 0.616 1.000 0.577 0.046 0.309 -0.153 -0.166 -0.141
V4 0.052 0.577 1.000 0.066 -0.107 0.224 0.224 0.007
V5 0.129 0.046 0.066 1.000 -0.129 0.043 0.201 -0.120
V6 0.285 0.309 -0.107 -0.129 1.000 -0.709 -0.710 -0.105
V7 -0.211 -0.153 0.224 0.043 -0.709 1.000 0.783 0.089
V8 -0.203 -0.166 0.224 0.201 -0.710 0.783 1.000 -0.004
V9 -0.063 -0.141 0.007 -0.120 -0.105 0.089 -0.004 1.000
#
# The statistical correlation is not a substantial correlation!
# For example, let's examine separately the statistical correlation between weight and
# height for males and females:
# Split the data frame by sex (V6: 1 = male, 2 = female) and compute the
# height/weight correlation within each subpopulation.
# NOTE(review): F shadows the built-in alias for FALSE (and M is reused from
# the earlier simulations); it works, but these names are risky.
M = subset(da,da$V6==1); F = subset(da,da$V6==2)
cor(M[7],M[8]); cor(F[7],F[8])
V8 V8
V7 0.5904648 V7 0.5191614
# They are both much less than 0.783!
# We can better understand the phenomenon with a graphic representation:
Plane(min(da[7]),max(da[7]),min(da[8]),max(da[8])); abovex("height"); abovey("weight")
POINT(F[,7],F[,8],"red"); Point(M[,7],M[,8],"blue")
# The problem is due to the presence of two subpopulations with different
# characteristics.
# How to associate a precision with cor? See down.
# ---------- ---------- ---------- ----------
# To study the relationship between a dependent variable y and an independent variable
# x we can use linear regression, ie (if (x1,y1), ..., (xN,yN) are the experimental points)
# we can find the function F: x->ax+b such that (F(x1)-y1)² + ... + (F(xN)-yN)² is minimum
# For instance we examine the same data (see).
H = da[,7]; W = da[,8]
Plane(min(H),max(H),min(W),max(W)); abovex("height"); abovey("weight")
Point(H,W,"brown")
LR(H,W) # You obtain A and B, if y=A+Bx is the line
[1] -90.8681981 0.8983596
f = function(x) LR(H,W)[1]+LR(H,W)[2]*x; graph1(f,150,200,"blue")
# or: abline(LR(H,W), col="blue")
# The line passes through the centroid (see the picture in the center)
POINT(mean(H),mean(W),"blue")
# We can consider the linear regression constrained to pass through (0,0) or through
# another point: it is the same problem with some constrains to F. The picture on the
# right illustrates two cases.
# Also for this linear regression the line passes through the centroid.
Plane(0,max(H),0,max(W)); abovex("height"); abovey("weight")
Point(H,W,"brown"); graph1(f,0,200,"blue"); POINT(mean(H),mean(W),"blue")
POINT(0,0,"black"); LR0(H,W,0,0) # You obtain A and B, if y=A+Bx is the line
0 0.3789037 <- LR01 LR02 y=LR01+LR02*x
# [ We note that I can obtain 0.3789037 also by: mean(H*W)/mean(H^2) ]
A=LR01; B=LR02; g = function(x) A + B*x; graph1(g,0,200,"seagreen")
POINT(50,40,"black"); LR0(H,W,50,40)
29.42683 0.2114634 <- LR01 LR02 y=LR01+LR02*x
h = function(x) LR01+LR02*x; graph1(h,0,200,"red")
# How to associate a precision with LR and LR0? See down.
# We can compute linear regression with regression and regression1 too (see).
# ---------- ---------- ---------- ----------
# A more significant example.
readLines("http://macosa.dima.unige.it/R/callup.txt",n=11)
[1] "# Medical examination for call-up (Italian Navy - 1987 - first contingent)"
[2] "# (some chest measurements are unreliable; pay attention to that)"
[3] "# chest (truncated to cm)"
[4] "# weight (truncated to kg)"
[5] "# region (numbered on the basis of latitude of the chief town: 0ao 1tr 2fr"
[6] "# 3lomb 4ven 5piem 6em-rom 7lig 8tosc 9mar 10um 11abr 12laz 13pugl"
[7] "# 14camp 15sard 16cal 17sic; the other regions are not present)"
[8] "# height (truncated to cm and decreased by 100: 178 cm -> 78)"
[9] "chest; weight; region; height"
[10] "91;62;15;73"
[11] "86;64;15;70"
da <- read.table("http://macosa.dima.unige.it/R/callup.txt",skip=8, sep=";",header=TRUE)
str(da)
'data.frame': 4170 obs. of 4 variables:
$ chest : int 91 86 89 88 92 95 91 83 87 88 ...
$ weight: int 62 64 64 68 70 77 65 62 55 60 ...
$ region: int 15 15 15 15 15 15 15 15 15 15 ...
$ height: int 73 70 74 79 66 69 66 79 75 65 ...
# If I want to visualize the data in a different way:
# data.entry(da)
summary(da)
chest weight region height
Min. : 48.00 Min. : 48.00 Min. : 0.00 Min. :47.00
1st Qu.: 86.00 1st Qu.: 63.00 1st Qu.: 7.00 1st Qu.:70.00
Median : 90.00 Median : 70.00 Median :12.00 Median :74.00
Mean : 91.19 Mean : 70.48 Mean :10.18 Mean :74.42
3rd Qu.: 95.00 3rd Qu.: 76.00 3rd Qu.:14.00 3rd Qu.:79.00
Max. :179.00 Max. :126.00 Max. :17.00 Max. :96.00
# We add 1/2 to truncated data, so we do not have to add 1/2 to the average values
# we get
da$chest <- da$chest+1/2; da$weight <- da$weight+1/2; da$height <- da$height+1/2
summary(da)
chest weight region height
Min. : 48.50 Min. : 48.50 Min. : 0.00 Min. :47.50
1st Qu.: 86.50 1st Qu.: 63.50 1st Qu.: 7.00 1st Qu.:70.50
Median : 90.50 Median : 70.50 Median :12.00 Median :74.50
Mean : 91.69 Mean : 70.98 Mean :10.18 Mean :74.92
3rd Qu.: 95.50 3rd Qu.: 76.50 3rd Qu.:14.00 3rd Qu.:79.50
Max. :179.50 Max. :126.50 Max. :17.00 Max. :96.50
round( cor(da), 3)
chest weight region height
chest 1.000 0.697 0.016 0.255
weight 0.697 1.000 0.023 0.458
region 0.016 0.023 1.000 -0.152
height 0.255 0.458 -0.152 1.000
#
# Let's shorten the names, to study the regression:
C = da$chest; W = da$weight; R = da$region; H = da$height
Plane(min(C)-5,max(C)+5, min(W)-5,max(W)+5); Point(C,W,"magenta")
LR(C,W)
[1] -15.300034 0.940987
f = function(x) LR(C,W)[1]+LR(C,W)[2]*x; graph1(f,50,200,"black")
# Let's study the inverse relationship, too:
LR(W,C)
[1] 54.9919381 0.5170087
g = function(y) LR(W,C)[1]+LR(W,C)[2]*y
# We must represent the inverse relation (x <-> y).
# We represent it as CURVE instead of function
G = function(x,y) x - g(y); CUR(G,"seagreen")
POINT(mean(C),mean(W),"blue")
# If we have a large numbers of points we can use more or less dense colors according
# to the frequency. We have two chances. The first (picture in the middle):
Plane(min(C)-5,max(C)+5, min(W)-5,max(W)+5)
Point(C,W, Dcol(C,W) )
graph1(f,50,200,"black"); CUR(G,"brown"); POINT(mean(C),mean(W),"red")
# The second (picture on the right)
Plane(min(C)-5,max(C)+5, min(W)-5,max(W)+5)
Point(C,W, DCOL(C,W) )
graph1(f,50,200,"black"); CUR(G,"brown")
#
# Let's see the histograms of height and weight, too:
dev.new(); hist(H,right=FALSE,probability=TRUE,col="grey");BOX()
dev.new(); hist(W,right=FALSE,probability=TRUE,col="grey");BOX()
# The histogram of chest is hampered by not reliable data.
# Well, I cut off the tails:
# Histogram of the raw chest data (hampered by unreliable extreme values):
dev.new(); hist(C,right=FALSE,probability=TRUE,col="grey");BOX()
# Cut off the tails: sort the 4170 observations and drop the 15 smallest
# and the 15 largest, keeping the 4140 central values.
# (Fix: C1 was used below but never defined — the sort step was missing.)
C1 <- sort(C)
C2 <- C1[16:(length(C1)-15)]
dev.new(); hist(C2,nclass=15,right=FALSE,probability=TRUE,col="grey");BOX()
# ---------- ---------- ---------- ----------
# QQ plot (quantiles against quantiles plot) is an instrument sometimes employed to
# compare distributions.
# An example of use (referring to the previous data).
# Compute the 5th, 10th, ..., 95th percentiles of height, weight and chest.
# Percentile, PlaneP, Point, abovex, abovey come from the macosa library.
HP=0; for(i in 1:19) HP[i] = Percentile(H,i*5)
WP=0; for(i in 1:19) WP[i] = Percentile(W,i*5)
CP=0; for(i in 1:19) CP[i] = Percentile(C,i*5)
# Some percentiles of the variables and their graphic comparison:
# (points near a straight line indicate similarly shaped distributions)
PlaneP(HP,WP); Point(HP,WP,"brown"); abovex("H");abovey("W")
PlaneP(HP,CP); Point(HP,CP,"brown"); abovex("H");abovey("C")
PlaneP(WP,CP); Point(WP,CP,"brown"); abovex("W");abovey("C")
# As we have seen (better) comparing the histograms, I find that W and C have a
# similar distribution.
# ---------- ---------- ---------- ----------
# Still something about the study of the relation between two variables.
# I hang different
# objects to an elastic.
# Let F be the weight (in
# g) of the objects, let
# H be the elongation
# (in mm) of the elastic.
#   H  |  F
#  15  | 260
#  20  | 380
#  39  | 710
#  52  | 990
# If I know that the values of H are ±2 and the values of F are ±5 I can proceed as we
# saw here:
# Elongation H (mm, known to +/-2) against weight F (g, known to +/-5);
# pointDif (macosa library) draws the data with their uncertainty boxes
# and the extreme lines through them.
H = c(15,20,39,52); F = c(260,380,710,990); eH = rep(2,4); eF = rep(5,4)
Plane(0,60, 0,1000); pointDif(0,0, H,F, eH,eF)
# 18.24074 * x
# 19.32432 * x
# F = k·H, 18.240 ≤ k ≤ 19.325, or k = 18.8±0.6
#
# If I don't know the precision of the measurements I can proceed as we saw just above:
Plane(0,60, 0,1000); POINT(H,F, "brown"); abovex("H"); abovey("F")
LR(H,F) # if I don't know that for H=0 F=0
# -21.40182 19.25085
f = function(x) LR(H,F)[1]+LR(H,F)[2]*x; graph1(f,0,60,"red")
LR0(H,F, 0,0) # if I know that for H=0 F=0
# 0 18.69485 <- LR01 LR02 y=LR01+LR02*x
g = function(x) LR02*x; graph1(g,0,60,"seagreen")
# To evaluate the goodness of approximation obtained with the regression line, I
# proceed as follows, in the first case (LR): (F function of H; first F, then H)
confInt(F,H, 90/100)
# 5 % 95 %
# (Intercept) -90.15108 47.34745
# Conf 17.27649 21.22522
# In the second case (LR0):
confInt0(F,H, 0,0, 90/100)
# 5 % 95 %
# Conf 18.03658 19.35311 (quite similar to 18.240 19.325)
# These are the confidence intervals at the 90% (0.9) significance level of the slope
# and (in the first case) of the intercept of the line. I can change the level.
# H and F are strongly correlated (see above):
cor(H,F)
# 0.9987686 (about 1)
# To value the efficacy of the correlation we determine a confidence interval
cor.test(H,F, conf.level=0.90)
# 90 percent confidence interval:
# 0.9674734 0.9999541 (comments on the other outputs)
# Another correlation coefficient often used is "Spearman's". It is indicated with ρ (the
# Greek letter "rho", which corresponds to our R) in that it compares the "ranks" of the
# data, ie not their values but their order number (1st, 2nd, ...). See here. An example:
x = c(10,20,30,40); y = c(15,60,70,200)
cor(x,y); cor(x,y, method="spearman")
# 0.9173057 1
# The data is arranged in an increasing way, but not aligned. The usual correlation
# coefficient is close to 1. The Spearman coefficient is instead just 1, indicating that
# the data is increasing, beyond the way they grow. Another example:
x = c(10,20,30,40,50); y = c(1,2,-900,3,4)
cor(x,y); cor(x,y, method="spearman")
# 0.002742232 0.7
# ---------- ---------- ---------- ----------
# For linear, quadratic and cubic regression and for spline see here
# ---------- ---------- ---------- ----------
# The coefficient of correlation can be found also for other type of function. An example.
# We can define:
COR2 = function(X,Y,f) {Sr=sum((y-f(x))^2);Sm=sum((Y-mean(Y))^2);sqrt((Sm-Sr)/Sm)}
# Or we can use the already defined function cor2
# We want compare the following data with the quadratic regression
x <- c(0,1,2,3,4,5); y <- c(2.1,7.7,13.6,27.2,40.9,61.1)
Plane(-1,6, 0,70); POINT(x,y, "blue")
regression2(x,y)
# 1.86 * x^2 + 2.36 * x + 2.48
f=function(x) 1.86 * x^2 + 2.36 * x + 2.48; graph1(f,-1,6, "black")
cor2(x,y,f)
# 0.9992544 A value near to 1! With linear regression I have:
cor(x,y)
# 0.9731813
# I can obtain the same value with cor2 if I put this function:
regression1(x,y)
# 11.66 * x + -3.724
g = function(x) 11.66 * x + -3.724; graph1(g,-1,6, "red")
cor2(x,y,g)
# 0.9731812
# But math is not enough! In the case of the picture on the right:
x <- c(0,1,2,3,4,5); y <- c(2,19,38,15,48,43)
Plane(-1,6, 0,50); POINT(x,y,"black")
regression1(x,y) # 7.686 * x + 8.286
f1=function(x) 7.686 * x + 8.286; graph1(f1,-1,6, "blue")
regression2(x,y) # -0.964 * x^2 + 12.5 * x + 5.07
f2=function(x) -0.964 * x^2 + 12.5 * x + 5.07; graph1(f2,-1,6, "brown")
regression3(x,y) # 0.87 * x^3 + -7.49 * x^2 + 24.4 * x + 2.46
f3=function(x) 0.87*x^3-7.49*x^2+24.4*x+2.46; graph1(f3,-1,6, "red")
cor2(x,y,f1); cor2(x,y,f2); cor2(x,y,f3)
# 0.7916388 0.8048205 0.8230846
# The cubic function gives a better correlation. But the type of function (a polynomial
# or other function) with which I can approximate the data depends on the context.
# For example, if I look for a function of the type
# x -> sin(U*x)+V*x I find x -> sin(3.58*x)+9.98*x
# (see here). I obtain a different correlation:
f=function(x) sin(3.58*x)+9.98*x; cor2(x,y,f)
# 0.768965
# If I look for a function of the type x -> U*x I find:
k=function(x) 9.94545*x; cor2(x,y, k)
# 0.7397531
#
# ---------- ---------- ---------- ----------
# Binomial and Gaussian distributions, Continuous case, Central limit theorem
# Exponential and Poisson distributions
# Test
Other examples of use