分类:标称型离散数据
回归:数值型连续
from numpy import *
import matplotlib.pyplot as plt
def loadDataSet(fileName): #general function to parse tab -delimited floats
numFeat = len(open(fileName).readline().split('\t')) - 1 #get number of fields
dataMat = []; labelMat = []
fr = open(fileName)
for line in fr.readlines():
lineArr =[]
curLine = line.strip().split('\t')
for i in range(numFeat):
lineArr.append(float(curLine[i]))
dataMat.append(lineArr)
labelMat.append(float(curLine[-1]))
return dataMat,labelMat
def standRegres(xArr,yArr):
xMat = mat(xArr); yMat = mat(yArr).T
xTx = xMat.T*xMat
if linalg.det(xTx) == 0.0:
print ("This matrix is singular, cannot do inverse")
return
ws = xTx.I * (xMat.T*yMat)
return ws
fileName='E:/ml/machinelearninginaction/Ch08/ex0.txt'
dataMat,labelMat=loadDataSet(fileName)
ws=standRegres(dataMat,labelMat)
xMat=mat(dataMat);yMat=mat(labelMat)
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
xcopy=xMat.copy()
xcopy.sort(0)#先将x排序,否则画图将出现问题
yhat=xcopy*ws
ax.plot(xcopy[:,1],yhat)
plt.show()
corrcoef(yhat.T,yMat)#计算相关系数,求匹配程度
线性回归容易出现欠拟合
#局部加权线性回归
def lwlr(testPoint,xArr,yArr,k=1.0):
xMat = mat(xArr); yMat = mat(yArr).T
m = shape(xMat)[0]
weights = mat(eye((m)))
for j in range(m): #next 2 lines create weights matrix
diffMat = testPoint - xMat[j,:] #
weights[j,j] = exp(diffMat*diffMat.T/(-2.0*k**2))
xTx = xMat.T * (weights * xMat)
if linalg.det(xTx) == 0.0:
print ("This matrix is singular, cannot do inverse")
return
ws = xTx.I * (xMat.T * (weights * yMat))
return testPoint * ws
def lwlrTest(testArr,xArr,yArr,k=1.0): #loops over all the data points and applies lwlr to each one
m = shape(testArr)[0]
yHat = zeros(m)
for i in range(m):
yHat[i] = lwlr(testArr[i],xArr,yArr,k)
return yHat
yhat=lwlrTest(xMat,xMat,yMat,k=1.0)
strInd=xMat[:,1].argsort(0)
xsort=xMat[strInd][:,0,:]
fig=plt.figure()
ax=fig.add_subplot(111)
ax.scatter(xMat[:,1].flatten().A[0],yMat.T[:,0].flatten().A[0])
ax.plot(xsort[:,1],yhat[strInd])
plt.show()
array([[1. , 0.98647703],
[0.98647703, 1. ]])
yhat=lwlrTest(xMat,xMat,yMat,k=0.003)
array([[1. , 0.99931945],
[0.99931945, 1. ]])
yhat=lwlrTest(xMat,xMat,yMat,k=30)
array([[1. , 0.98647356],
[0.98647356, 1. ]])
def rssError(yArr,yHatArr): #yArr and yHatArr both need to be arrays
return ((yArr-yHatArr)**2).sum()
#较小核容易过拟合
岭回归:特征比样本点还多
def ridgeRegres(xMat,yMat,lam=0.2):
xTx = xMat.T*xMat
denom = xTx + eye(shape(xMat)[1])*lam
if linalg.det(denom) == 0.0:
print( "This matrix is singular, cannot do inverse")
return
ws = denom.I * (xMat.T*yMat)
return ws
def ridgeTest(xArr,yArr):
xMat = mat(xArr); yMat=mat(yArr).T
yMean = mean(yMat,0)
yMat = yMat - yMean #to eliminate X0 take mean off of Y
#regularize X's
xMeans = mean(xMat,0) #calc mean then subtract it off
xVar = var(xMat,0) #calc variance of Xi then divide by it
xMat = (xMat - xMeans)/xVar
numTestPts = 30
wMat = zeros((numTestPts,shape(xMat)[1]))
for i in range(numTestPts):
ws = ridgeRegres(xMat,yMat,exp(i-10))
wMat[i,:]=ws.T
return wMat
ws=ridgeTest(dataMat,labelMat)
fig=plt.figure()
ax=fig.add_subplot(111)
ax.plot(ws)
plt.show()
前向逐步线性回归
def regularize(xMat):#regularize by columns
inMat = xMat.copy()
inMeans = mean(inMat,0) #calc mean then subtract it off
inVar = var(inMat,0) #calc variance of Xi then divide by it
inMat = (inMat - inMeans)/inVar
return inMat
def stageWise(xArr,yArr,eps=0.01,numIt=100):#eps步长
xMat = mat(xArr); yMat=mat(yArr).T
yMean = mean(yMat,0)
yMat = yMat - yMean #can also regularize ys but will get smaller coef
xMat = regularize(xMat)
m,n=shape(xMat)
returnMat = zeros((numIt,n)) #testing code remove
ws = zeros((n,1)); wsTest = ws.copy(); wsMax = ws.copy()
for i in range(numIt):
print (ws.T)
lowestError = inf;
for j in range(n):#分别计算增加或减少该特征对误差影响
for sign in [-1,1]:
wsTest = ws.copy()
wsTest[j] += eps*sign
yTest = xMat*wsTest
rssE = rssError(yMat.A,yTest.A)
if rssE < lowestError:
lowestError = rssE
wsMax = wsTest
ws = wsMax.copy()
returnMat[i,:]=ws.T
return returnMat