Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
225 views

Пример построения и анализа регрессионной модели средствами SageMath

Везде далее рассматривается пример из методички (исходные данные на стр. 14)

Чтение данных

# Read the tab-separated data file: build the response vector Y and the
# design matrix X (first column of ones for the intercept).
# Using 'with' guarantees the file is closed even if a parse error occurs
# (the original open()/close() pair leaked the handle on exception).
with open('data.tsv', 'r') as f:
    f.readline()  # skip the header row
    Y = []
    X = []
    for line in f:
        # replace decimal commas by points, then split into cells;
        # the first cell is a row label and is dropped
        row = [float(cell) for cell in line.replace(',', '.').split()[1:]]
        Y.append(row[0])
        X.append([1] + row[1:])  # prepend the constant regressor
Y = vector(RDF, Y)
X = matrix(RDF, X)
show(Y, X)  # check if everything is correct
$\displaystyle \left(9.26,\,9.38,\,12.11,\,10.81,\,9.35,\,9.87,\,8.17,\,9.12,\,5.88,\,6.3,\,6.22,\,5.49,\,6.5,\,6.61,\,4.32,\,7.37,\,7.02,\,8.25,\,8.15,\,8.72,\,6.64,\,8.1,\,5.52,\,9.37,\,13.17\right)$ $\displaystyle \left(\begin{array}{rrrrr} 1.0 & 0.23 & 47750.0 & 6.4 & 17.72 \\ 1.0 & 0.24 & 50391.0 & 7.8 & 18.39 \\ 1.0 & 0.19 & 43149.0 & 9.76 & 26.46 \\ 1.0 & 0.17 & 41089.0 & 7.9 & 22.37 \\ 1.0 & 0.23 & 14257.0 & 5.35 & 28.13 \\ 1.0 & 0.43 & 22661.0 & 9.9 & 17.55 \\ 1.0 & 0.31 & 52509.0 & 4.5 & 21.92 \\ 1.0 & 0.26 & 14903.0 & 4.88 & 19.52 \\ 1.0 & 0.49 & 25587.0 & 3.46 & 23.99 \\ 1.0 & 0.36 & 16821.0 & 3.6 & 21.76 \\ 1.0 & 0.37 & 19459.0 & 3.56 & 25.68 \\ 1.0 & 0.43 & 12973.0 & 5.65 & 18.13 \\ 1.0 & 0.35 & 50907.0 & 4.28 & 25.74 \\ 1.0 & 0.38 & 6920.0 & 8.85 & 21.21 \\ 1.0 & 0.42 & 5736.0 & 8.52 & 22.97 \\ 1.0 & 0.3 & 26705.0 & 7.19 & 16.38 \\ 1.0 & 0.32 & 20068.0 & 4.82 & 13.21 \\ 1.0 & 0.25 & 11487.0 & 5.46 & 14.48 \\ 1.0 & 0.31 & 32029.0 & 6.2 & 13.38 \\ 1.0 & 0.26 & 18946.0 & 4.25 & 13.69 \\ 1.0 & 0.37 & 28025.0 & 5.38 & 16.66 \\ 1.0 & 0.29 & 20968.0 & 5.88 & 15.06 \\ 1.0 & 0.34 & 11049.0 & 9.27 & 20.09 \\ 1.0 & 0.23 & 45893.0 & 4.36 & 15.98 \\ 1.0 & 0.17 & 99400.0 & 10.31 & 18.27 \end{array}\right)$

Настройка модели методом 1МНК

# OLS (1MNK) point estimates of the regression coefficients:
# beta_hat = (X'X)^(-1) X'y
Xt = X.transpose()
beta_hat = (Xt * X)^(-1) * Xt * Y
show(r'$\widehat{\beta}='+latex(beta_hat)+'$')
$\widehat{\beta}= \left(10.4391313616,\,-14.7547393191,\,3.12878927408 \times 10^{-05},\,0.198262382765,\,-8.62534668938 \times 10^{-05}\right) $

Вычисление остатков

# Residuals of the fitted model.
Y_hat = X * beta_hat   # fitted (model-predicted) response
u_hat = Y - Y_hat      # residual vector: observed minus fitted
show(r'$\widehat{u}='+latex(u_hat)+'$')
$\widehat{u}= \left(-0.546889034888,\,-0.639482512475,\,1.19146923606,\,0.0292427639867,\,0.800109757172,\,3.10510776713,\,-0.228348179101,\,1.08498063558,\,1.18620896945,\,-0.0655865530808,\,-0.0723080120393,\,-0.129109974233,\,-1.21408818972,\,-0.191635289598,\,-1.78882245942,\,-0.902346441854,\,-0.279985487689,\,0.058885784518,\,0.0546452089217,\,0.682886128663,\,-0.281935644161,\,0.119214672455,\,-2.08437937613,\,0.0255177617402,\,0.0866484687017\right) $
# Graphical check of residual normality: relative-frequency histogram
# over 10 equal-width bins.
a, b = min(u_hat), max(u_hat)
h = (b - a) / 10
q = []
for i in xrange(10):
    lo = a + i * h
    hi = a + (i + 1) * h
    # Left-closed bins; the last bin is also right-closed so the maximum
    # residual is counted.  The original used strict '<' on both sides,
    # which silently dropped the endpoints (min and max were never
    # binned), so the frequencies did not sum to 1.
    if i < 9:
        m = len([p for p in u_hat if lo <= p < hi])
    else:
        m = len([p for p in u_hat if lo <= p <= hi])
    q.append(float(m) / len(u_hat))
list_plot(q, plotjoined=True)
bar_chart(q)
# Dump the raw residual values (handy for pasting into the R cell below).
print(u_hat)
(-0.5468890348882383, -0.6394825124745029, 1.1914692360582286, 0.029242763986664144, 0.8001097571719047, 3.105107767131045, -0.22834817910063343, 1.0849806355832943, 1.1862089694514628, -0.06558655308077999, -0.07230801203933712, -0.12910997423303794, -1.2140881897151417, -0.1916352895978788, -1.7888224594156386, -0.9023464418541947, -0.2799854876885579, 0.05888578451796178, 0.05464520892173752, 0.6828861286625809, -0.2819356441609493, 0.11921467245509199, -2.084379376131719, 0.025517761740225353, 0.08664846870165377)
# тест Шапиро - Уилка, если вы всё ещё не уверены :) # UPD 16.04.17: p-value для этого теста должно быть БОЛЬШЕ 0.05 (нулевая гипотеза - распределение нормальное) # Значит, тест показывает, что нормального распределения у нас нет, хотя было очень похоже (не дотянули совсем немного до 0.05) %r u = c(-0.5468890348882383, -0.6394825124745029, 1.1914692360582286, 0.029242763986664144, 0.8001097571719047, 3.105107767131045, -0.22834817910063343, 1.0849806355832943, 1.1862089694514628, -0.06558655308077999, -0.07230801203933712, -0.12910997423303794, -1.2140881897151417, -0.1916352895978788, -1.7888224594156386, -0.9023464418541947, -0.2799854876885579, 0.05888578451796178, 0.05464520892173752, 0.6828861286625809, -0.2819356441609493, 0.11921467245509199, -2.084379376131719, 0.025517761740225353, 0.08664846870165377) shapiro.test(u)
Shapiro-Wilk normality test data: u W = 0.91782, p-value = 0.04571

Оценка качества модели

# Model quality: R^2 (plain, corrected, Amemiya), residual variance,
# MSE, MAPE and the overall F-test of model significance.
y_mean = mean(Y)  # sample mean of the response
T = X.nrows()     # length of the sample
N = X.ncols()     # number of predictors (with constant)
TSS = sum([(y-y_mean)^2 for y in Y])        # total sum of squares
ESS = sum([(yh-y_mean)^2 for yh in Y_hat])  # explained sum of squares
RSS = sum([uh^2 for uh in u_hat])           # residual sum of squares
R2 = ESS/TSS  # coefficient of determination
show(r'$R^2='+latex(R2)+'$')
R2T = 1 - (1-R2)*(T-1)/(T-N)  # corrected (degrees-of-freedom adjusted) R^2
show(r'$R^2_T='+latex(R2T)+'$')
# NOTE: this is Amemiya's variant, not a duplicate of R2T
# (the original labelled both lines "adjusted R^2")
R2A = 1 - (1-R2)*(T+N)/(T-N)
show(r'$R^2_A='+latex(R2A)+'$')
sigma_u2 = RSS/(T-N)  # unbiased estimate of the residual variance
show(r'$\widehat{\sigma}_u^2='+latex(sigma_u2)+'$')
MSE = RSS/T  # mean squared error
show('$MSE='+latex(MSE)+'$')
# mean absolute percentage error (xrange for consistency with the other cells)
MAPE = 100/T*sum([abs(u_hat[i])/Y[i] for i in xrange(T)])
show('$MAPE='+latex(MAPE)+'\,\%$')
# F-test of joint significance of the regression at the 10% level
import scipy.stats
F_stat = R2*(T-N)/(1-R2)/(N-1)
F_crit = scipy.stats.f.isf(0.1, N-1, T-N)  # F-critical (or use a table directly)
if F_stat > F_crit:
    show('Model is significant')
else:
    show('Model is NOT significant')
$R^2= 0.762831668914 $
$R^2_T= 0.715398002697 $
$R^2_A= 0.644247503372 $
$\widehat{\sigma}_u^2= 1.27977454464 $
$MSE= 1.02381963571 $
$MAPE= 9.404480739 \,\%$
Model is significant

Стандартизированная модель

# Standardized ("beta-coefficient") model: centre and scale every
# variable, then estimate the equation without an intercept.
# The column means and standard deviations are computed once up front:
# the original recomputed mean()/std() for every single matrix entry,
# i.e. O(T^2 * N) redundant work for the same values.
Y_sd = std(list(Y))
Y_norm = vector([(y-y_mean)/Y_sd for y in Y])
col_means = [mean(X.column(j)) for j in xrange(1, N)]
col_sds = [std(list(X.column(j))) for j in xrange(1, N)]
X_norm = matrix([[(X[i,j]-col_means[j-1])/col_sds[j-1] for j in xrange(1, N)]
                 for i in xrange(T)])
# standardized equation
beta_hat_st = (X_norm.transpose()*X_norm)^(-1)*X_norm.transpose()*Y_norm
show(r'$\widehat{\beta}^{\text{st}}='+latex(beta_hat_st)+'$')
$\widehat{\beta}^{\text{st}}= \left(-0.597874481002,\,0.30571496364,\,0.202240460487,\,-0.000176545406672\right) $

Коэффициенты эластичности

# Elasticity coefficients: the percentage change of the response caused
# by a 1% change of each regressor, evaluated at the sample means.
for k in xrange(1, N):
    e_k = beta_hat[k] * mean(X.column(k)) / mean(Y)
    show(r'$\varepsilon_'+str(k+1)+'='+latex(e_k)+'$')
$\varepsilon_2= -0.563269671576 $
$\varepsilon_3= 0.114740163997 $
$\varepsilon_4= 0.15484518174 $
$\varepsilon_5= -0.000209001087802 $

Дисперсионно-ковариационная матрица коэффициентов регрессии

# Variance-covariance matrix of the coefficient estimates:
# Sigma_beta = sigma_u^2 * (X'X)^(-1)
XtX_inv = (X.transpose() * X)^(-1)
Sigma_beta_hat = sigma_u2 * XtX_inv
show(r'$\widehat{\Sigma}_{\beta}='+latex(Sigma_beta_hat)+'$')
$\widehat{\Sigma}_{\beta}= \left(\begin{array}{rrrrr} 2.87318292909 & -3.61659764486 & -9.47261669994 \times 10^{-06} & -0.0805724181663 & -0.0470653102213 \\ -3.61659764486 & 10.234855547 & 2.12695950768 \times 10^{-05} & 0.0412592508087 & -0.0217411159215 \\ -9.47261669994 \times 10^{-06} & 2.12695950768 \times 10^{-05} & 1.75203316052 \times 10^{-10} & -1.99596943345 \times 10^{-07} & -5.13825292209 \times 10^{-08} \\ -0.0805724181663 & 0.0412592508087 & -1.99596943345 \times 10^{-07} & 0.0121866100359 & -0.000154487436638 \\ -0.0470653102213 & -0.0217411159215 & -5.13825292209 \times 10^{-08} & -0.000154487436638 & 0.00287756779883 \end{array}\right) $
# Correlation matrix of the coefficient estimates: each covariance is
# scaled by the standard errors of the two estimates involved.
sd = [sqrt(Sigma_beta_hat[k,k]) for k in xrange(N)]  # standard errors
r = matrix([[Sigma_beta_hat[i,j]/sd[i]/sd[j] for j in xrange(N)]
            for i in xrange(N)])
show('$r='+latex(r)+'$')
$r= \left(\begin{array}{rrrrr} 1.0 & -0.666926075322 & -0.422198965444 & -0.430589368086 & -0.517614584211 \\ -0.666926075322 & 1.0 & 0.502281522981 & 0.116825869462 & -0.126685873568 \\ -0.422198965444 & 0.502281522981 & 1.0 & -0.13659703362 & -0.0723654945697 \\ -0.430589368086 & 0.116825869462 & -0.13659703362 & 1.0 & -0.0260878883769 \\ -0.517614584211 & -0.126685873568 & -0.0723654945697 & -0.0260878883769 & 1.0 \end{array}\right) $

Проверка значимости коэффициентов и доверительные интервалы

# t-test of individual significance of each coefficient; for the
# significant ones an 80% confidence interval is shown.
# The import and the critical value are hoisted out of the loop: both are
# loop-invariant (the original re-imported scipy.stats and recomputed
# t_crit on every iteration); se_k is computed once instead of three times.
import scipy.stats
t_crit = scipy.stats.t.isf(0.1, T-N)  # critical value, 10% in each tail
for k in xrange(N):
    se_k = sqrt(Sigma_beta_hat[k,k])  # standard error of beta_k
    t_stat = beta_hat[k] / se_k
    if abs(t_stat) > t_crit:
        low = beta_hat[k] - t_crit*se_k
        up = beta_hat[k] + t_crit*se_k
        show(r'$\widehat{\beta}_'+str(k+1)+'$'+'is significant: '+\
             r'$\widehat{\beta}_'+str(k+1)+r'\in'+latex((low, up))+'$')
    else:
        show(r'$\widehat{\beta}_'+str(k+1)+'$'+'is NOT significant')
$\widehat{\beta}_1$is significant: $\widehat{\beta}_1\in \left(8.19261712083, 12.6856456024\right) $
$\widehat{\beta}_2$is significant: $\widehat{\beta}_2\in \left(-18.9947641112, -10.514714527\right) $
$\widehat{\beta}_3$is significant: $\widehat{\beta}_3\in \left(1.37451013861 \times 10^{-05}, 4.88306840955 \times 10^{-05}\right) $
$\widehat{\beta}_4$is significant: $\widehat{\beta}_4\in \left(0.05195407259, 0.344570692941\right) $
$\widehat{\beta}_5$is NOT significant

Точечный прогноз

# Point prediction of the response at the vector of regressor means.
x_pred = vector([mean(col) for col in X.columns()])
Y_ppred = beta_hat * x_pred  # dot product: predicted response
show('Prediction point:', Y_ppred)
Prediction point: $\displaystyle 8.068$

Интервальные прогнозы

# Confidence interval for the mathematical expectation of the response
# at the prediction point.
se_mean = sqrt(x_pred*(Sigma_beta_hat*x_pred))  # std. error of the mean response
low = Y_ppred - t_crit * se_mean
up = Y_ppred + t_crit * se_mean
show('Prediction interval for mathematical expectation: '+'$'+latex((low, up))+'$')
Prediction interval for mathematical expectation: $ \left(7.76813604364, 8.36786395636\right) $
# Prediction interval for an individual value: the residual variance is
# added to the variance of the mean response, giving a wider interval.
se_ind = sqrt(sigma_u2+x_pred*(Sigma_beta_hat*x_pred))
low = Y_ppred - t_crit * se_ind
up = Y_ppred + t_crit * se_ind
show('Prediction interval for individual value: '+'$'+latex((low, up))+'$')
Prediction interval for individual value: $ \left(6.53898783512, 9.59701216488\right) $

Частные коэффициенты детерминации

# Partial determination coefficients: each regressor's contribution to
# R^2, expressed through its t-statistic.
for k in xrange(1, N):
    t_k = beta_hat[k] / sqrt(Sigma_beta_hat[k,k])
    delta_R2 = (1-R2)/(T-N) * t_k^2
    show(r'$\Delta R^2_'+str(k+1)+'='+latex(delta_R2)+'$')
$\Delta R^2_2= 0.252236578258 $
$\Delta R^2_3= 0.0662577995036 $
$\Delta R^2_4= 0.0382493826818 $
$\Delta R^2_5= 3.06587368244 \times 10^{-08} $

Спецификация модели

# model specification # Ramsey test RSS1 = RSS # RSS for model (1) X_new = X for p in xrange(2, 5) : X_new = X_new.augment(vector([yh^p for yh in Y_hat])) beta_new = (X_new.transpose()*X_new)^(-1)*X_new.transpose()*Y Y_hat_new = X_new * beta_new RSS2 = sum([(Y[i]-Y_hat_new[i])^2 for i in xrange(T)]) # RSS for model (2) F_stat = (RSS1-RSS2)*(T-3)/3/RSS2 F_crit = scipy.stats.f.isf(0.1, 3, T-3) if F_stat > F_crit : show('Model is badly-specified') else : show('Model is well-specified') # Amemiya test AF = RSS * (T+N) / (T-N) show('$AF='+latex(AF)+'$')
Model is well-specified
$AF= 38.3932363391 $