Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
225 views

Пример построения и анализа регрессионной модели средствами SageMath

Везде далее рассматривается пример из методички (исходные данные на стр. 14)

Чтение данных

# Read the tab-separated data file: build the response vector Y and the
# design matrix X (first column of ones for the intercept).
# Using 'with' guarantees the file is closed even if a parse error occurs
# (the original open()/close() pair leaked the handle on exception).
with open('data.tsv', 'r') as f:
    f.readline()  # skip the header row
    Y = []
    X = []
    for line in f:
        # replace decimal commas by points, then split into cells;
        # the first cell is a row label and is dropped
        row = [float(cell) for cell in line.replace(',', '.').split()[1:]]
        Y.append(row[0])
        X.append([1] + row[1:])  # prepend the constant regressor
Y = vector(RDF, Y)
X = matrix(RDF, X)
show(Y, X)  # check if everything is correct
$\displaystyle \left(9.26,\,9.38,\,12.11,\,10.81,\,9.35,\,9.87,\,8.17,\,9.12,\,5.88,\,6.3,\,6.22,\,5.49,\,6.5,\,6.61,\,4.32,\,7.37,\,7.02,\,8.25,\,8.15,\,8.72,\,6.64,\,8.1,\,5.52,\,9.37,\,13.17\right)$ $\displaystyle \left(\begin{array}{rrrrr} 1.0 & 0.23 & 47750.0 & 6.4 & 17.72 \\ 1.0 & 0.24 & 50391.0 & 7.8 & 18.39 \\ 1.0 & 0.19 & 43149.0 & 9.76 & 26.46 \\ 1.0 & 0.17 & 41089.0 & 7.9 & 22.37 \\ 1.0 & 0.23 & 14257.0 & 5.35 & 28.13 \\ 1.0 & 0.43 & 22661.0 & 9.9 & 17.55 \\ 1.0 & 0.31 & 52509.0 & 4.5 & 21.92 \\ 1.0 & 0.26 & 14903.0 & 4.88 & 19.52 \\ 1.0 & 0.49 & 25587.0 & 3.46 & 23.99 \\ 1.0 & 0.36 & 16821.0 & 3.6 & 21.76 \\ 1.0 & 0.37 & 19459.0 & 3.56 & 25.68 \\ 1.0 & 0.43 & 12973.0 & 5.65 & 18.13 \\ 1.0 & 0.35 & 50907.0 & 4.28 & 25.74 \\ 1.0 & 0.38 & 6920.0 & 8.85 & 21.21 \\ 1.0 & 0.42 & 5736.0 & 8.52 & 22.97 \\ 1.0 & 0.3 & 26705.0 & 7.19 & 16.38 \\ 1.0 & 0.32 & 20068.0 & 4.82 & 13.21 \\ 1.0 & 0.25 & 11487.0 & 5.46 & 14.48 \\ 1.0 & 0.31 & 32029.0 & 6.2 & 13.38 \\ 1.0 & 0.26 & 18946.0 & 4.25 & 13.69 \\ 1.0 & 0.37 & 28025.0 & 5.38 & 16.66 \\ 1.0 & 0.29 & 20968.0 & 5.88 & 15.06 \\ 1.0 & 0.34 & 11049.0 & 9.27 & 20.09 \\ 1.0 & 0.23 & 45893.0 & 4.36 & 15.98 \\ 1.0 & 0.17 & 99400.0 & 10.31 & 18.27 \end{array}\right)$

Настройка модели методом 1МНК

# OLS (1MNK) point estimates of the regression coefficients:
# beta_hat = (X'X)^(-1) X'y
Xt = X.transpose()
beta_hat = (Xt * X)^(-1) * Xt * Y
show(r'$\widehat{\beta}='+latex(beta_hat)+'$')
$\widehat{\beta}= \left(10.4391313616,\,-14.7547393191,\,3.12878927408 \times 10^{-05},\,0.198262382765,\,-8.62534668938 \times 10^{-05}\right) $

Вычисление остатков

# Residuals of the fitted model.
Y_hat = X * beta_hat   # fitted (model-predicted) response
u_hat = Y - Y_hat      # residual vector: observed minus fitted
show(r'$\widehat{u}='+latex(u_hat)+'$')
$\widehat{u}= \left(-0.546889034888,\,-0.639482512475,\,1.19146923606,\,0.0292427639867,\,0.800109757172,\,3.10510776713,\,-0.228348179101,\,1.08498063558,\,1.18620896945,\,-0.0655865530808,\,-0.0723080120393,\,-0.129109974233,\,-1.21408818972,\,-0.191635289598,\,-1.78882245942,\,-0.902346441854,\,-0.279985487689,\,0.058885784518,\,0.0546452089217,\,0.682886128663,\,-0.281935644161,\,0.119214672455,\,-2.08437937613,\,0.0255177617402,\,0.0866484687017\right) $
# Graphical check of residual normality: relative-frequency histogram
# over 10 equal-width bins.
a, b = min(u_hat), max(u_hat)
h = (b - a) / 10
q = []
for i in xrange(10):
    lo = a + i * h
    hi = a + (i + 1) * h
    # Left-closed bins; the last bin is also right-closed so the maximum
    # residual is counted.  The original used strict '<' on both sides,
    # which silently dropped the endpoints (min and max were never
    # binned), so the frequencies did not sum to 1.
    if i < 9:
        m = len([p for p in u_hat if lo <= p < hi])
    else:
        m = len([p for p in u_hat if lo <= p <= hi])
    q.append(float(m) / len(u_hat))
list_plot(q, plotjoined=True)
bar_chart(q)
# Dump the raw residual values (handy for pasting into the R cell below).
print(u_hat)
(-0.5468890348882383, -0.6394825124745029, 1.1914692360582286, 0.029242763986664144, 0.8001097571719047, 3.105107767131045, -0.22834817910063343, 1.0849806355832943, 1.1862089694514628, -0.06558655308077999, -0.07230801203933712, -0.12910997423303794, -1.2140881897151417, -0.1916352895978788, -1.7888224594156386, -0.9023464418541947, -0.2799854876885579, 0.05888578451796178, 0.05464520892173752, 0.6828861286625809, -0.2819356441609493, 0.11921467245509199, -2.084379376131719, 0.025517761740225353, 0.08664846870165377)
# тест Шапиро - Уилка, если вы всё ещё не уверены :) # UPD 16.04.17: p-value для этого теста должно быть БОЛЬШЕ 0.05 (нулевая гипотеза - распределение нормальное) # Значит, тест показывает, что нормального распределения у нас нет, хотя было очень похоже (не дотянули совсем немного до 0.05) %r u = c(-0.5468890348882383, -0.6394825124745029, 1.1914692360582286, 0.029242763986664144, 0.8001097571719047, 3.105107767131045, -0.22834817910063343, 1.0849806355832943, 1.1862089694514628, -0.06558655308077999, -0.07230801203933712, -0.12910997423303794, -1.2140881897151417, -0.1916352895978788, -1.7888224594156386, -0.9023464418541947, -0.2799854876885579, 0.05888578451796178, 0.05464520892173752, 0.6828861286625809, -0.2819356441609493, 0.11921467245509199, -2.084379376131719, 0.025517761740225353, 0.08664846870165377) shapiro.test(u)
Shapiro-Wilk normality test data: u W = 0.91782, p-value = 0.04571

Оценка качества модели

# Model quality: R^2 (plain, corrected, Amemiya), residual variance,
# MSE, MAPE and the overall F-test of model significance.
y_mean = mean(Y)  # sample mean of the response
T = X.nrows()     # length of the sample
N = X.ncols()     # number of predictors (with constant)
TSS = sum([(y-y_mean)^2 for y in Y])        # total sum of squares
ESS = sum([(yh-y_mean)^2 for yh in Y_hat])  # explained sum of squares
RSS = sum([uh^2 for uh in u_hat])           # residual sum of squares
R2 = ESS/TSS  # coefficient of determination
show(r'$R^2='+latex(R2)+'$')
R2T = 1 - (1-R2)*(T-1)/(T-N)  # corrected (degrees-of-freedom adjusted) R^2
show(r'$R^2_T='+latex(R2T)+'$')
# NOTE: this is Amemiya's variant, not a duplicate of R2T
# (the original labelled both lines "adjusted R^2")
R2A = 1 - (1-R2)*(T+N)/(T-N)
show(r'$R^2_A='+latex(R2A)+'$')
sigma_u2 = RSS/(T-N)  # unbiased estimate of the residual variance
show(r'$\widehat{\sigma}_u^2='+latex(sigma_u2)+'$')
MSE = RSS/T  # mean squared error
show('$MSE='+latex(MSE)+'$')
# mean absolute percentage error (xrange for consistency with the other cells)
MAPE = 100/T*sum([abs(u_hat[i])/Y[i] for i in xrange(T)])
show('$MAPE='+latex(MAPE)+'\,\%$')
# F-test of joint significance of the regression at the 10% level
import scipy.stats
F_stat = R2*(T-N)/(1-R2)/(N-1)
F_crit = scipy.stats.f.isf(0.1, N-1, T-N)  # F-critical (or use a table directly)
if F_stat > F_crit:
    show('Model is significant')
else:
    show('Model is NOT significant')
$R^2= 0.762831668914 $
$R^2_T= 0.715398002697 $
$R^2_A= 0.644247503372 $
$\widehat{\sigma}_u^2= 1.27977454464 $
$MSE= 1.02381963571 $
$MAPE= 9.404480739 \,\%$
Model is significant

Стандартизированная модель

# Standardized ("beta-coefficient") model: centre and scale every
# variable, then estimate the equation without an intercept.
# The column means and standard deviations are computed once up front:
# the original recomputed mean()/std() for every single matrix entry,
# i.e. O(T^2 * N) redundant work for the same values.
Y_sd = std(list(Y))
Y_norm = vector([(y-y_mean)/Y_sd for y in Y])
col_means = [mean(X.column(j)) for j in xrange(1, N)]
col_sds = [std(list(X.column(j))) for j in xrange(1, N)]
X_norm = matrix([[(X[i,j]-col_means[j-1])/col_sds[j-1] for j in xrange(1, N)]
                 for i in xrange(T)])
# standardized equation
beta_hat_st = (X_norm.transpose()*X_norm)^(-1)*X_norm.transpose()*Y_norm
show(r'$\widehat{\beta}^{\text{st}}='+latex(beta_hat_st)+'$')
$\widehat{\beta}^{\text{st}}= \left(-0.597874481002,\,0.30571496364,\,0.202240460487,\,-0.000176545406672\right) $

Коэффициенты эластичности

# Elasticity coefficients: the percentage change of the response caused
# by a 1% change of each regressor, evaluated at the sample means.
for k in xrange(1, N):
    e_k = beta_hat[k] * mean(X.column(k)) / mean(Y)
    show(r'$\varepsilon_'+str(k+1)+'='+latex(e_k)+'$')
$\varepsilon_2= -0.563269671576 $
$\varepsilon_3= 0.114740163997 $
$\varepsilon_4= 0.15484518174 $
$\varepsilon_5= -0.000209001087802 $

Дисперсионно-ковариационная матрица коэффициентов регрессии

# Variance-covariance matrix of the coefficient estimates:
# Sigma_beta = sigma_u^2 * (X'X)^(-1)
XtX_inv = (X.transpose() * X)^(-1)
Sigma_beta_hat = sigma_u2 * XtX_inv
show(r'$\widehat{\Sigma}_{\beta}='+latex(Sigma_beta_hat)+'$')
$\widehat{\Sigma}_{\beta}= \left(\begin{array}{rrrrr} 2.87318292909 & -3.61659764486 & -9.47261669994 \times 10^{-06} & -0.0805724181663 & -0.0470653102213 \\ -3.61659764486 & 10.234855547 & 2.12695950768 \times 10^{-05} & 0.0412592508087 & -0.0217411159215 \\ -9.47261669994 \times 10^{-06} & 2.12695950768 \times 10^{-05} & 1.75203316052 \times 10^{-10} & -1.99596943345 \times 10^{-07} & -5.13825292209 \times 10^{-08} \\ -0.0805724181663 & 0.0412592508087 & -1.99596943345 \times 10^{-07} & 0.0121866100359 & -0.000154487436638 \\ -0.0470653102213 & -0.0217411159215 & -5.13825292209 \times 10^{-08} & -0.000154487436638 & 0.00287756779883 \end{array}\right) $
# Correlation matrix of the coefficient estimates: each covariance is
# scaled by the standard errors of the two estimates involved.
sd = [sqrt(Sigma_beta_hat[k,k]) for k in xrange(N)]  # standard errors
r = matrix([[Sigma_beta_hat[i,j]/sd[i]/sd[j] for j in xrange(N)]
            for i in xrange(N)])
show('$r='+latex(r)+'$')
$r= \left(\begin{array}{rrrrr} 1.0 & -0.666926075322 & -0.422198965444 & -0.430589368086 & -0.517614584211 \\ -0.666926075322 & 1.0 & 0.502281522981 & 0.116825869462 & -0.126685873568 \\ -0.422198965444 & 0.502281522981 & 1.0 & -0.13659703362 & -0.0723654945697 \\ -0.430589368086 & 0.116825869462 & -0.13659703362 & 1.0 & -0.0260878883769 \\ -0.517614584211 & -0.126685873568 & -0.0723654945697 & -0.0260878883769 & 1.0 \end{array}\right) $

Проверка значимости коэффициентов и доверительные интервалы

# t-test of individual significance of each coefficient; for the
# significant ones an 80% confidence interval is shown.
# The import and the critical value are hoisted out of the loop: both are
# loop-invariant (the original re-imported scipy.stats and recomputed
# t_crit on every iteration); se_k is computed once instead of three times.
import scipy.stats
t_crit = scipy.stats.t.isf(0.1, T-N)  # critical value, 10% in each tail
for k in xrange(N):
    se_k = sqrt(Sigma_beta_hat[k,k])  # standard error of beta_k
    t_stat = beta_hat[k] / se_k
    if abs(t_stat) > t_crit:
        low = beta_hat[k] - t_crit*se_k
        up = beta_hat[k] + t_crit*se_k
        show(r'$\widehat{\beta}_'+str(k+1)+'$'+'is significant: '+\
             r'$\widehat{\beta}_'+str(k+1)+r'\in'+latex((low, up))+'$')
    else:
        show(r'$\widehat{\beta}_'+str(k+1)+'$'+'is NOT significant')
$\widehat{\beta}_1$is significant: $\widehat{\beta}_1\in \left(8.19261712083, 12.6856456024\right) $
$\widehat{\beta}_2$is significant: $\widehat{\beta}_2\in \left(-18.9947641112, -10.514714527\right) $
$\widehat{\beta}_3$is significant: $\widehat{\beta}_3\in \left(1.37451013861 \times 10^{-05}, 4.88306840955 \times 10^{-05}\right) $
$\widehat{\beta}_4$is significant: $\widehat{\beta}_4\in \left(0.05195407259, 0.344570692941\right) $
$\widehat{\beta}_5$is NOT significant

Точечный прогноз

# Point prediction of the response at the vector of regressor means.
x_pred = vector([mean(col) for col in X.columns()])
Y_ppred = beta_hat * x_pred  # dot product: predicted response
show('Prediction point:', Y_ppred)
Prediction point: $\displaystyle 8.068$

Интервальные прогнозы

# Confidence interval for the mathematical expectation of the response
# at the prediction point.
se_mean = sqrt(x_pred*(Sigma_beta_hat*x_pred))  # std. error of the mean response
low = Y_ppred - t_crit * se_mean
up = Y_ppred + t_crit * se_mean
show('Prediction interval for mathematical expectation: '+'$'+latex((low, up))+'$')
Prediction interval for mathematical expectation: $ \left(7.76813604364, 8.36786395636\right) $
# Prediction interval for an individual value: the residual variance is
# added to the variance of the mean response, giving a wider interval.
se_ind = sqrt(sigma_u2+x_pred*(Sigma_beta_hat*x_pred))
low = Y_ppred - t_crit * se_ind
up = Y_ppred + t_crit * se_ind
show('Prediction interval for individual value: '+'$'+latex((low, up))+'$')
Prediction interval for individual value: $ \left(6.53898783512, 9.59701216488\right) $

Частные коэффициенты детерминации

# Partial determination coefficients: each regressor's contribution to
# R^2, expressed through its t-statistic.
for k in xrange(1, N):
    t_k = beta_hat[k] / sqrt(Sigma_beta_hat[k,k])
    delta_R2 = (1-R2)/(T-N) * t_k^2
    show(r'$\Delta R^2_'+str(k+1)+'='+latex(delta_R2)+'$')
$\Delta R^2_2= 0.252236578258 $
$\Delta R^2_3= 0.0662577995036 $
$\Delta R^2_4= 0.0382493826818 $
$\Delta R^2_5= 3.06587368244 \times 10^{-08} $

Спецификация модели

# model specification # Ramsey test RSS1 = RSS # RSS for model (1) X_new = X for p in xrange(2, 5) : X_new = X_new.augment(vector([yh^p for yh in Y_hat])) beta_new = (X_new.transpose()*X_new)^(-1)*X_new.transpose()*Y Y_hat_new = X_new * beta_new RSS2 = sum([(Y[i]-Y_hat_new[i])^2 for i in xrange(T)]) # RSS for model (2) F_stat = (RSS1-RSS2)*(T-3)/3/RSS2 F_crit = scipy.stats.f.isf(0.1, 3, T-3) if F_stat > F_crit : show('Model is badly-specified') else : show('Model is well-specified') # Amemiya test AF = RSS * (T+N) / (T-N) show('$AF='+latex(AF)+'$')
Model is well-specified
$AF= 38.3932363391 $