我制作了一个用于比较不同 PCA 的小脚本,作为答案出现在这里:
import numpy as np
from scipy.linalg import svd
shape = (26424, 144)
repeat = 20
pca_components = 2
data = np.array(np.random.randint(255, size=shape)).astype('float64')
# data normalization
# data.dot(data.T)
# (U, s, Va) = svd(data, full_matrices=False)
# data = data / s[0]
from fbpca import diffsnorm
from timeit import default_timer as timer
from scipy.linalg import svd
start = timer()
for i in range(repeat):
(U, s, Va) = svd(data, full_matrices=False)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('svd time: %.3fms, error: %E' % (time*1000/repeat, err))
from matplotlib.mlab import PCA
start = timer()
_pca = PCA(data)
for i in range(repeat):
U = _pca.project(data)
time = timer() - start
err = diffsnorm(data, U, _pca.fracs, _pca.Wt)
print('matplotlib PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
from fbpca import pca
start = timer()
for i in range(repeat):
(U, s, Va) = pca(data, pca_components, True)
time = timer() - start
err = diffsnorm(data, U, s, Va)
print('facebook pca time: %.3fms, error: %E' % (time*1000/repeat, err))
from sklearn.decomposition import PCA
start = timer()
_pca = PCA(n_components = pca_components)
_pca.fit(data)
for i in range(repeat):
U = _pca.transform(data)
time = timer() - start
err = diffsnorm(data, U, _pca.explained_variance_, _pca.components_)
print('sklearn PCA time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
(U, s, Va) = pca_mark(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s, Va.T)
print('pca by Mark time: %.3fms, error: %E' % (time*1000/repeat, err))
start = timer()
for i in range(repeat):
(U, s, Va) = pca_doug(data, pca_components)
time = timer() - start
err = diffsnorm(data, U, s[:pca_components], Va.T)
print('pca by doug time: %.3fms, error: %E' % (time*1000/repeat, err))
pca_mark 是pca in Mark's answer。
pca_doug 是pca in doug's answer。
这是一个示例输出(但结果很大程度上取决于数据大小和 pca_components,因此我建议使用您自己的数据运行您自己的测试。此外,facebook 的 pca 针对标准化数据进行了优化,因此它会在这种情况下更快更准确):
svd time: 3212.228ms, error: 1.907320E-10
matplotlib PCA time: 879.210ms, error: 2.478853E+05
facebook pca time: 485.483ms, error: 1.260335E+04
sklearn PCA time: 169.832ms, error: 7.469847E+07
pca by Mark time: 293.758ms, error: 1.713129E+02
pca by doug time: 300.326ms, error: 1.707492E+02
编辑:
来自 fbpca 的 diffsnorm 函数计算 Schur 分解的谱范数误差。