Overview
I. Learning NumPy
Because numpy.array would shadow Python's built-in array module if imported wholesale, the convention is to work with NumPy under the alias np:
>>> import numpy
>>> numpy.version.full_version
'1.9.3'
>>> import numpy as np
>>> a = np.array([0,1,2,3,4,5])
>>> a
array([0, 1, 2, 3, 4, 5])
>>> a.ndim
1
>>> a.shape
(6,)
Convert this array into a two-dimensional matrix:
>>> b = a.reshape((3,2))
>>> b
array([[0, 1],
       [2, 3],
       [4, 5]])
>>> b.ndim
2
>>> b.shape
(3, 2)
>>>
NumPy avoids copying data wherever possible. For example:
>>> b[1][0]=100
>>> b
array([[  0,   1],
       [100,   3],
       [  4,   5]])
>>> a
array([  0,   1, 100,   3,   4,   5])
If you need a true copy, do the following:
>>> c = a.reshape((3,2)).copy()
>>> c
array([[  0,   1],
       [100,   3],
       [  4,   5]])
>>> c[0][0] = 100
>>> c
array([[100,   1],
       [100,   3],
       [  4,   5]])
>>> a
array([  0,   1, 100,   3,   4,   5])
>>> b
array([[  0,   1],
       [100,   3],
       [  4,   5]])
>>>
NumPy propagates operations on an array to each individual element:
>>> a*2
array([  0,   2, 200,   6,   8,  10])
>>> a**2
array([    0,     1, 10000,     9,    16,    25])
>>>
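For contrast, applying the same operator to an ordinary Python list does not work element-wise; it repeats the list. A quick check (the small list here is just for illustration):
>>> [1, 2, 3] * 2
[1, 2, 3, 1, 2, 3]
>>> np.array([1, 2, 3]) * 2
array([2, 4, 6])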
Besides normal list indexing, NumPy also lets you use arrays themselves as indices:
>>> a[np.array([2,3,4])]
array([100,   3,   4])
>>> a>4
array([False, False,  True, False, False,  True], dtype=bool)
>>> a[a>4]=4
>>> a
array([0, 1, 4, 3, 4, 4])
>>>
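If you want to cap large values without modifying a in place, np.where builds a new array from a condition instead (a small sketch; the array d is introduced here purely for illustration):
>>> d = np.array([0, 1, 100, 3, 5, 6])
>>> np.where(d > 4, 4, d)
array([0, 1, 4, 3, 4, 4])
>>> d
array([  0,   1, 100,   3,   5,   6])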
Since trimming outliers is a frequent task, there is a dedicated function for it:
>>> a
array([0, 1, 4, 3, 4, 4])
>>> a.clip(0,3)
array([0, 1, 3, 3, 3, 3])
>>>
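Note that clip() as used above returns a clipped copy and leaves a itself unchanged; passing an output array clips in place (a minimal sketch based on the array from above):
>>> a
array([0, 1, 4, 3, 4, 4])
>>> a.clip(0, 3, out=a)
array([0, 1, 3, 3, 3, 3])
>>> a
array([0, 1, 3, 3, 3, 3])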
2. Handling missing values
Entries marked with numpy.NAN indicate that they are not real numbers:
>>> c=np.array([1,2,np.NAN,3,4])
>>> c
array([  1.,   2.,  nan,   3.,   4.])
>>> np.isnan(c)
array([False, False,  True, False, False], dtype=bool)
>>> c[~np.isnan(c)]
array([ 1.,  2.,  3.,  4.])
>>> np.mean(c[~np.isnan(c)])
2.5
>>>
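NumPy also provides nan-aware aggregation functions that skip NaN entries directly, giving the same result as masking first (a minimal sketch; np.nanmean is available from NumPy 1.8 onwards):
>>> np.nanmean(c)
2.5
>>> np.nansum(c)
10.0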
3. Runtime comparison
import timeit
normal_py_sec=timeit.timeit('sum(x*x for x in range(1000))',number=1000)
naive_np_sec=timeit.timeit('sum(na*na)',setup="import numpy as np;na=np.arange(1000)",number=1000)
good_np_sec=timeit.timeit('na.dot(na)',setup="import numpy as np;na=np.arange(1000)",number=1000)
print("Normal Python :%f sec"%normal_py_sec)
print("Naive Numpy:%f sec"%naive_np_sec)
print("Good Numpy:%f sec"%good_np_sec)
Normal Python :0.101323 sec
Naive Numpy:0.083080 sec
Good Numpy:0.001787 sec
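All three variants compute exactly the same number; what differs is where the looping happens (a Python-level loop for the first two versus a single C-level dot product). A quick consistency check, as a sketch:
import numpy as np
na = np.arange(1000)
py_result = sum(x * x for x in range(1000))   # pure Python loop
naive_np = sum(na * na)                       # Python sum() over a NumPy array
good_np = na.dot(na)                          # stays entirely in C
assert py_result == naive_np == good_np       # all three give 332833500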
II. A Worked Example
1. Reading in the data
The data can be read in easily with SciPy's genfromtxt():
import scipy as sp
data = sp.genfromtxt("web_traffic.tsv", delimiter="\t")
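A quick sanity check on what was read in (a sketch; the exact numbers depend on the file, e.g. the book's data set has shape (743, 2)):
print(data[:10])    # first ten (hour, hits) rows; missing values appear as nan
print(data.shape)   # (number of rows, number of columns)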
2. Preprocessing and cleaning the data
1) Extract the x and y coordinates
x = data[:,0]
y = data[:,1]
2) Remove invalid data
First check how many invalid values there are:
sp.sum(sp.isnan(y))
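The NaN entries can then be dropped from both vectors with the same boolean mask, which is exactly what the plotting function below does; a minimal sketch:
valid = ~sp.isnan(y)   # True where y holds a real measurement
x = x[valid]
y = y[valid]
print("Remaining entries:", len(y))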
3) Plot the data
import scipy as sp
import matplotlib.pyplot as plt

def showData():
    data = sp.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]
    nan = sp.sum(sp.isnan(y))  # number of invalid entries
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)
3. Choosing the right model and learning algorithm
1) Approximation error
def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)
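The helper accepts any callable model f that maps x to predicted values. Usage might look like this (the polynomial below is a placeholder, not a fitted model):
f = sp.poly1d([2.0, 100.0])   # the line y = 2*x + 100, purely illustrative
print("Squared error:", error(f, x, y))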
2) First-order model
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

def showData():
    data = sp.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]
    nan = sp.sum(sp.isnan(y))  # number of invalid entries
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    # fit a straight line (degree-1 polynomial) to the data
    fp1, residuals, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
    print("Model parameters: %s" % fp1)
    f1 = sp.poly1d(fp1)
    fx = sp.linspace(0, x[-1], 1000)
    plt.plot(fx, f1(fx), linewidth=4)
    plt.legend(["d=%i" % f1.order], loc="upper left")
    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)
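To quantify how well the straight line fits, the error() helper from 1) can be applied to f1 (a sketch; the lines belong inside showData(), after f1 has been fitted, and the printed number depends on the data):
# inside showData(), after f1 has been fitted:
print("Error of the d=1 model: %f" % error(f1, x, y))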
3) Second-order model
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt

def showData():
    data = sp.genfromtxt("web_traffic.tsv", delimiter="\t")
    x = data[:, 0]
    y = data[:, 1]
    nan = sp.sum(sp.isnan(y))  # number of invalid entries
    x = x[~sp.isnan(y)]
    y = y[~sp.isnan(y)]
    # first-order fit
    fp1, residuals, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
    print("Model parameters: %s" % fp1)
    f1 = sp.poly1d(fp1)
    fx = sp.linspace(0, x[-1], 1000)
    plt.plot(fx, f1(fx), linewidth=4)
    # second-order fit
    f2p = sp.polyfit(x, y, 2)
    f2 = sp.poly1d(f2p)
    plt.plot(fx, f2(fx), linewidth=4)
    print(f2.order)
    # one legend listing both curves
    plt.legend(["d=%i" % f1.order, "d=%i" % f2.order], loc="upper left")
    plt.scatter(x, y)
    plt.title("web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks([w * 7 * 24 for w in range(10)],
               ['week %i' % w for w in range(10)])
    plt.autoscale(tight=True)
    plt.grid()
    plt.show()
    print(data)
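The same helper makes the improvement of the second-order fit over the straight line measurable (again a sketch to be placed inside showData(), where f1, f2 and the cleaned x, y are in scope):
# inside showData(), after both fits:
print("Error d=1: %f" % error(f1, x, y))
print("Error d=2: %f" % error(f2, x, y))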
4) Combined models
# -*- coding: utf-8 -*-
import os
import scipy as sp
import matplotlib.pyplot as plt
data_dir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), ".", "data")
data = sp.genfromtxt(os.path.join(data_dir, "web_traffic.tsv"), delimiter="\t")
print(data[:10])
# all examples will have three classes in this file
colors = ['g', 'k', 'b', 'm', 'r']
linestyles = ['-', '-.', '--', ':', '-']
x = data[:, 0]
y = data[:, 1]
print("Number of invalid entries:", sp.sum(sp.isnan(y)))
x = x[~sp.isnan(y)]
y = y[~sp.isnan(y)]
# plot input data
def plot_models(x, y, models, fname, mx=None, ymax=None, xmin=None):
    plt.clf()
    plt.scatter(x, y, s=10)
    plt.title("Web traffic over the last month")
    plt.xlabel("Time")
    plt.ylabel("Hits/hour")
    plt.xticks(
        [w * 7 * 24 for w in range(10)], ['week %i' % w for w in range(10)])
    if models:
        if mx is None:
            mx = sp.linspace(0, x[-1], 1000)
        for model, style, color in zip(models, linestyles, colors):
            # print "Model:",model
            # print "Coeffs:",model.coeffs
            plt.plot(mx, model(mx), linestyle=style, linewidth=2, c=color)
        plt.legend(["d=%i" % m.order for m in models], loc="upper left")
    plt.autoscale(tight=True)
    plt.ylim(ymin=0)
    if ymax:
        plt.ylim(ymax=ymax)
    if xmin:
        plt.xlim(xmin=xmin)
    plt.grid(True, linestyle='-', color='0.75')
    plt.savefig(fname)
# first look at the data
plot_models(x, y, None, os.path.join("..", "1400_01_01.png"))
# create and plot models
fp1, res, rank, sv, rcond = sp.polyfit(x, y, 1, full=True)
print("Model parameters: %s" % fp1)
print("Error of the model:", res)
f1 = sp.poly1d(fp1)
f2 = sp.poly1d(sp.polyfit(x, y, 2))
f3 = sp.poly1d(sp.polyfit(x, y, 3))
f10 = sp.poly1d(sp.polyfit(x, y, 10))
f100 = sp.poly1d(sp.polyfit(x, y, 100))
plot_models(x, y, [f1], os.path.join("..", "1400_01_02.png"))
plot_models(x, y, [f1, f2], os.path.join("..", "1400_01_03.png"))
plot_models(
x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_04.png"))
# fit and plot a model using the knowledge about inflection point
inflection = int(3.5 * 7 * 24)  # index of the inflection point (in hours)
xa = x[:inflection]
ya = y[:inflection]
xb = x[inflection:]
yb = y[inflection:]
fa = sp.poly1d(sp.polyfit(xa, ya, 1))
fb = sp.poly1d(sp.polyfit(xb, yb, 1))
plot_models(x, y, [fa, fb], os.path.join("..", "1400_01_05.png"))
def error(f, x, y):
    return sp.sum((f(x) - y) ** 2)

print("Errors for the complete data set:")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, x, y)))

print("Errors for only the time after inflection point")
for f in [f1, f2, f3, f10, f100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))

print("Error inflection=%f" % (error(fa, xa, ya) + error(fb, xb, yb)))
# extrapolating into the future
plot_models(
x, y, [f1, f2, f3, f10, f100], os.path.join("..", "1400_01_06.png"),
mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
ymax=10000, xmin=0 * 7 * 24)
print("Trained only on data after inflection point")
fb1 = fb
fb2 = sp.poly1d(sp.polyfit(xb, yb, 2))
fb3 = sp.poly1d(sp.polyfit(xb, yb, 3))
fb10 = sp.poly1d(sp.polyfit(xb, yb, 10))
fb100 = sp.poly1d(sp.polyfit(xb, yb, 100))
print("Errors for only the time after inflection point")
for f in [fb1, fb2, fb3, fb10, fb100]:
    print("Error d=%i: %f" % (f.order, error(f, xb, yb)))
plot_models(
x, y, [fb1, fb2, fb3, fb10, fb100], os.path.join("..", "1400_01_07.png"),
mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
ymax=10000, xmin=0 * 7 * 24)
# separating training from testing data
frac = 0.3
split_idx = int(frac * len(xb))
shuffled = sp.random.permutation(list(range(len(xb))))
test = sorted(shuffled[:split_idx])
train = sorted(shuffled[split_idx:])
fbt1 = sp.poly1d(sp.polyfit(xb[train], yb[train], 1))
fbt2 = sp.poly1d(sp.polyfit(xb[train], yb[train], 2))
fbt3 = sp.poly1d(sp.polyfit(xb[train], yb[train], 3))
fbt10 = sp.poly1d(sp.polyfit(xb[train], yb[train], 10))
fbt100 = sp.poly1d(sp.polyfit(xb[train], yb[train], 100))
print("Test errors for only the time after inflection point")
for f in [fbt1, fbt2, fbt3, fbt10, fbt100]:
    print("Error d=%i: %f" % (f.order, error(f, xb[test], yb[test])))
plot_models(
x, y, [fbt1, fbt2, fbt3, fbt10, fbt100], os.path.join("..",
"1400_01_08.png"),
mx=sp.linspace(0 * 7 * 24, 6 * 7 * 24, 100),
ymax=10000, xmin=0 * 7 * 24)
from scipy.optimize import fsolve
print(fbt2)
print(fbt2 - 100000)
# fsolve finds a root of the polynomial fbt2(t) - 100000 = 0, starting the
# search at t = 800 hours; dividing by 7 * 24 converts hours into weeks.
reached_max = fsolve(fbt2 - 100000, 800) / (7 * 24)
print("100,000 hits/hour expected at week %f" % reached_max[0])
5) Training and testing
The train/test split itself is already part of the combined script above: 30% of the data after the inflection point (frac = 0.3) is held out as a test set, the models fbt1 through fbt100 are fitted on the remaining 70%, and their errors are evaluated only on the held-out points.
6) Validation