Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
231 views

Построение и анализ ЛММ с помощью R

Подготовка данных

f = open('data.tsv', 'r') s = f.read().replace(',', '.') f.close() fnew = open('d.tsv', 'w') fnew.write(s) fnew.close()
%r d <- read.table('d.tsv', header=TRUE, sep='\t') str(d) # Ok!
'data.frame': 25 obs. of 6 variables: $ X..з.п: int 1 2 3 4 5 6 7 8 9 10 ... $ Y : num 9.26 9.38 12.11 10.81 9.35 ... $ X2 : num 0.23 0.24 0.19 0.17 0.23 0.43 0.31 0.26 0.49 0.36 ... $ X3 : int 47750 50391 43149 41089 14257 22661 52509 14903 25587 16821 ... $ X4 : num 6.4 7.8 9.76 7.9 5.35 9.9 4.5 4.88 3.46 3.6 ... $ X5 : num 17.7 18.4 26.5 22.4 28.1 ...

Построение ЛММ

%r d.lm <- lm(Y ~ X2 + X3 + X4 + X5, data=d) summary(d.lm) yhat <- d.lm$fitted.values u <- d.lm$residuals mse <- mean(u^2) mape <- 100*mean(abs(u/d$Y)) print(paste("MSE =", mse)) print(paste("MAPE =", mape))
Call: lm(formula = Y ~ X2 + X3 + X4 + X5, data = d) Residuals: Min 1Q Median 3Q Max -2.08438 -0.28194 -0.06559 0.11921 3.10511 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.044e+01 1.695e+00 6.159 5.12e-06 *** X2 -1.475e+01 3.199e+00 -4.612 0.000169 *** X3 3.129e-05 1.324e-05 2.364 0.028323 * X4 1.983e-01 1.104e-01 1.796 0.087623 . X5 -8.625e-05 5.364e-02 -0.002 0.998733 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.131 on 20 degrees of freedom Multiple R-squared: 0.7628, Adjusted R-squared: 0.7154 F-statistic: 16.08 on 4 and 20 DF, p-value: 4.858e-06 [1] "MSE = 1.02381963571039" [1] "MAPE = 9.4044807390036"
%r par(mfrow = c(2, 2)) plot(d.lm) par(mfrow = c(1, 1))

Нормальность остатков

%r u <- d.lm$residuals # 1 shapiro.test(u) # 2 library(lattice) histogram(u) # 3 qqnorm(u) qqline(u, col="green")
Shapiro-Wilk normality test data: u W = 0.91782, p-value = 0.04571

Спецификация

%r # тест Рэмси library(lmtest) resettest(d.lm, power=2:4) # большое p-значение --> хорошо! # кол-во регрессоров + константа N = ncol(d) - 1 # длина выборки T = nrow(d) # функция Амемии af <- sum(u^2) * (T+N) / (T-N) print(paste("AF =", af))
RESET test data: d.lm RESET = 0.36365, df1 = 3, df2 = 17, p-value = 0.7801 [1] "AF = 38.3932363391395"

Стандартизированная модель

%r sd.lm <- lm(Y ~ X2 + X3 + X4 + X5, data=data.frame(scale(d))) summary(sd.lm)
Call: lm(formula = Y ~ X2 + X3 + X4 + X5, data = data.frame(scale(d))) Residuals: Min 1Q Median 3Q Max -0.98294 -0.13295 -0.03093 0.05622 1.46430 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.128e-16 1.067e-01 0.000 1.000000 X2 -5.979e-01 1.296e-01 -4.612 0.000169 *** X3 3.057e-01 1.293e-01 2.364 0.028323 * X4 2.022e-01 1.126e-01 1.796 0.087623 . X5 -1.765e-04 1.098e-01 -0.002 0.998733 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 0.5335 on 20 degrees of freedom Multiple R-squared: 0.7628, Adjusted R-squared: 0.7154 F-statistic: 16.08 on 4 and 20 DF, p-value: 4.858e-06

Эластичность

%r eps <- 0 beta <- d.lm$coefficients for (k in 2:5) { eps[k] <- beta[k] * mean(d[,k+1]) / mean(d[,2]) } eps
  1. 0
  2. -0.563269671575895
  3. 0.114740163997365
  4. 0.15484518174032
  5. -0.000209001087802315

Дисперсионно-ковариационная матрица коэффициентов

%r sigma.beta <- summary(d.lm)$cov.unscaled sigma.beta
(Intercept)X2X3X4X5
(Intercept) 2.245070e+00-2.8259646670-7.401786e-06-6.295829e-02-3.677625e-02
X2-2.825965e+00 7.9973895323 1.661980e-05 3.223947e-02-1.698824e-02
X3-7.401786e-06 0.0000166198 1.369017e-10-1.559626e-07-4.014967e-08
X4-6.295829e-02 0.0322394683-1.559626e-07 9.522466e-03-1.207146e-04
X5-3.677625e-02-0.0169882391-4.014967e-08-1.207146e-04 2.248496e-03
%r r.beta <- summary(d.lm, correlation=TRUE)$correlation r.beta
(Intercept)X2X3X4X5
(Intercept) 1.0000000 -0.6669261 -0.42219897-0.43058937-0.51761458
X2-0.6669261 1.0000000 0.50228152 0.11682587-0.12668587
X3-0.4221990 0.5022815 1.00000000-0.13659703-0.07236549
X4-0.4305894 0.1168259 -0.13659703 1.00000000-0.02608789
X5-0.5176146 -0.1266859 -0.07236549-0.02608789 1.00000000

Значимость и доверительные интервалы коэффициентов

# Значимость проверяется автоматически; см. результаты функции lm() %r # резюме по коэффициентам coef(summary(d.lm)) # доверительные интервалы confint(d.lm, level=0.9)
EstimateStd. Errort valuePr(>|t|)
(Intercept) 1.043913e+011.695047e+00 6.158610298 5.122049e-06
X2-1.475474e+013.199196e+00 -4.612014693 1.685705e-04
X3 3.128789e-051.323644e-05 2.363769656 2.832335e-02
X4 1.982624e-011.103930e-01 1.795969116 8.762338e-02
X5-8.625347e-055.364297e-02 -0.001607918 9.987330e-01
5 %95 %
(Intercept) 7.515654e+00 1.336261e+01
X2-2.027245e+01-9.237027e+00
X3 8.458765e-06 5.411702e-05
X4 7.865596e-03 3.886592e-01
X5-9.260526e-02 9.243275e-02

Частные коэффициенты детерминации

%r beta <- d.lm$coefficients R2 <- summary(d.lm)$r.squared delta.R2 <- 0 for (k in 2:5) { delta.R2[k] <- ((beta[k]/sqrt(sigma.beta[k,k]))^2)*(1-R2)/(T-N) } delta.R2
  1. 0
  2. 0.322805952080858
  3. 0.0847950451884349
  4. 0.0489505863042629
  5. 3.92362709586809e-08

Прогнозы

# выберем для построения модели 20 случайных наблюдений %r train <- sample(1:25, 20) d.train <- lm(Y ~ X2 + X3 + X4 + X5, data=d, subset=train) summary(d.train)
Call: lm(formula = Y ~ X2 + X3 + X4 + X5, data = d, subset = train) Residuals: Min 1Q Median 3Q Max -2.4972 -0.2969 -0.1291 0.2079 2.6858 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 9.933e+00 1.942e+00 5.114 0.000127 *** X2 -1.448e+01 3.817e+00 -3.795 0.001762 ** X3 2.908e-05 1.475e-05 1.971 0.067424 . X4 2.623e-01 1.205e-01 2.178 0.045810 * X5 1.274e-02 5.768e-02 0.221 0.828201 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 1.078 on 15 degrees of freedom Multiple R-squared: 0.7882, Adjusted R-squared: 0.7317 F-statistic: 13.95 on 4 and 15 DF, p-value: 6.089e-05
%r new <- d[-train,] d.predict <- predict(d.train, new) d.predict d$Y[-train] predict(d.train, new, interval="prediction") predict(d.train, new, interval="confidence")
1
9.89486139746746
9
4.79295905539256
13
7.79459142218283
15
6.54385770695411
16
8.45903229962523
  1. 9.26
  2. 5.88
  3. 6.5
  4. 4.32
  5. 7.37
fitlwrupr
19.894861 7.475786 12.313937
94.792959 1.742130 7.843788
137.794591 4.936873 10.652309
156.543858 3.899736 9.187980
168.459032 6.064698 10.853367
fitlwrupr
19.894861 9.137241 10.652482
94.792959 2.785568 6.800350
137.794591 6.094999 9.494183
156.543858 5.234873 7.852842
168.459032 7.784572 9.133493
# прогнозы для средних %r d[26,] <- c(26, 0, mean(d$X2), mean(d$X3), mean(d$X4), mean(d$X5)) new <- d[26,] predict(d.lm, new) predict(d.lm, new, interval="prediction") predict(d.lm, new, interval="confidence")
26: 8.068
fitlwrupr
268.068 5.66147710.47452
fitlwrupr
268.068 7.5960428.539958