from math import sqrt
import pandas as pd
import numpy as np
# Banner, in English: "The three correlation measures for the same vectors
# are computed as follows:"
print("同一个向量三个相关性计算如下:")

def multipl(a1, b):
    """Return the sum of the element-wise products of ``a1`` and ``b``.

    The running total is also printed to stdout with the ``sumofab:`` label.

    Args:
        a1: Indexable sequence of numbers that supports ``len``.
        b: Numbers paired with ``a1`` by position. May be any object with a
            ``tolist()`` method or any plain iterable (list, tuple, ...).

    Returns:
        The sum of ``a1[i] * b[i]`` over every index of ``a1``, as a float.

    Raises:
        IndexError: If ``b`` has fewer elements than ``a1``.
    """
    # Flatten ``b`` to a plain list so it is always indexed by position.
    # Objects without ``tolist`` (lists, tuples, generators) are copied.
    b1 = b.tolist() if hasattr(b, "tolist") else list(b)

    sumofab = 0.0
    # Index-based on purpose: a short ``b`` must fail loudly instead of
    # being silently truncated the way ``zip`` would.
    for i in range(len(a1)):
        sumofab += a1[i] * b1[i]
    print("sumofab:", sumofab)
    return sumofab

def corrcoef(x, y):
    """Return the Pearson correlation coefficient of two paired samples.

    Uses the single-pass formula::

        r = (Sxy - Sx*Sy/n) / sqrt((Sxx - Sx**2/n) * (Syy - Sy**2/n))

    Args:
        x: Sequence of numbers; it must support ``len`` because its length
            is used as the sample size ``n``.
        y: Numbers paired with ``x`` by position. Both arguments are handed
            to ``multipl`` to compute the sum of products.

    Returns:
        The correlation coefficient, or ``0`` when it is undefined: empty
        input, or a sample whose variance term is zero.
    """
    n = len(x)
    if n == 0:
        # No observations: report "no correlation" like the other
        # degenerate cases instead of dividing by zero below.
        return 0

    # Plain sums
    sum1 = sum(x)
    sum2 = sum(y)
    # Sum of products
    sumofxy = multipl(x, y)
    # Sums of squares
    sumofx2 = sum(i ** 2 for i in x)
    sumofy2 = sum(j ** 2 for j in y)

    num = sumofxy - (float(sum1) * float(sum2) / n)
    var_x = sumofx2 - float(sum1 ** 2) / n
    var_y = sumofy2 - float(sum2 ** 2) / n
    # For a (near-)constant sample, rounding can push a variance term to
    # zero or slightly below; sqrt() of the product would then raise
    # ValueError or return garbage. Treat it as "no correlation".
    if var_x <= 0 or var_y <= 0:
        return 0

    den = sqrt(var_x * var_y)
    if den == 0:
        # The product of two tiny positive terms can still underflow to 0.
        return 0
    return num / den

# Column layout of 4-27-test.csv as used by this script.
LABEL_COLUMN = 23                      # column holding the label
SKIPPED_COLUMNS = (19, LABEL_COLUMN)   # column 19 contains "-" entries
COLUMN_COUNT = 46                      # columns 0..45 are examined

# Read the CSV through a context manager so the handle is always closed.
# NOTE(review): `prefix` and `error_bad_lines` are pre-2.0 pandas keywords;
# they are kept so the script still runs on the pandas version it was
# written for. Confirm the target version before modernising them.
with open('4-27-test.csv', 'rb') as textFile:
    data = pd.read_csv(textFile, header=None, prefix='x', error_bad_lines=False)
data = data.dropna()
print(data)

# Shuffle the rows. `sample` returns a new frame, so the result has to be
# assigned back; otherwise the shuffle is silently thrown away.
data = data.sample(frac=1).reset_index(drop=True)
print(data.shape)

# The columns are labelled 'x0', 'x1', ..., so integer access is
# positional; `.iloc` states that explicitly.
y = data.iloc[:, LABEL_COLUMN]
for i in range(COLUMN_COUNT):
    if i in SKIPPED_COLUMNS:
        continue
    x = [int(x1) for x1 in data.iloc[:, i]]
    print('第%d个字段与label的皮尔斯相关系数为:%f' % (i, corrcoef(x, y)))

# Euclidean distance
# -*-coding:utf-8-*-
def distance(vector1, vector2):
    """Return the Euclidean distance between two vectors.

    Components are paired with ``zip``, so if the inputs differ in length
    the surplus components of the longer one are ignored.
    """
    squared_total = 0
    for left, right in zip(vector1, vector2):
        gap = left - right
        squared_total += gap ** 2
    return squared_total ** 0.5

# Euclidean distance between x and y, which still hold the values assigned
# in the last iteration of the loop above.
print('欧式距离:%f' % (distance(x, y),))

import numpy as np

# Custom cosine-similarity function
def get_cossimi(x, y):
    """Return the cosine similarity of two equally-shaped numeric inputs.

    Args:
        x: Array-like of numbers.
        y: Array-like of numbers with the same shape as ``x``.

    Returns:
        ``sum(x * y) / (norm(x) * norm(y))``, or ``0.0`` when either input
        has zero norm and the similarity is undefined (the same convention
        ``corrcoef`` uses for a zero denominator).
    """
    myx = np.array(x, dtype=float)
    myy = np.array(y, dtype=float)

    # Use np.sum for every reduction: the builtin sum() only folds the first
    # axis of an ndarray (and loops in Python), which disagreed with the
    # numerator for anything that is not 1-D.
    dot = np.sum(myx * myy)
    norm_product = np.sqrt(np.sum(myx * myx)) * np.sqrt(np.sum(myy * myy))
    if norm_product == 0:
        # Zero vector (or empty input): avoid a nan result and the
        # divide-by-zero RuntimeWarning.
        return 0.0
    return dot / norm_product

# Cosine similarity of the same x and y used for the distance above.
print('余弦相似性:%f' % (get_cossimi(x, y),))

 

python实现皮尔逊算法

相关文章:

  • 2022-01-13
  • 2022-12-23
  • 2021-04-17
  • 2021-12-22
  • 2022-12-23
  • 2021-07-30
  • 2021-12-05
  • 2021-10-13
猜你喜欢
  • 2021-08-25
  • 2022-03-03
  • 2021-12-28
相关资源
相似解决方案