Hands-On Machine Learning: Chapter 3 Exercise Solutions

Overview

3.8 Exercises

Dataset preparation

import numpy as np
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)  # as_frame=False keeps X and y as NumPy arrays on newer scikit-learn versions
X, y = mnist["data"], mnist["target"]
y = y.astype(np.uint8)  # convert the string labels to integers
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

1. A classifier with over 97% accuracy

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()
param_grid = [{'weights':["uniform","distance"], 'n_neighbors':[3,4,5]}]
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_score_)

from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
print(accuracy_score(y_test, y_pred))

2. Data augmentation

import matplotlib.pyplot as plt
from scipy.ndimage import shift  # scipy.ndimage.interpolation is deprecated in newer SciPy
from sklearn.neighbors import KNeighborsClassifier

def shift_image(image, dx, dy):
    image = image.reshape((28, 28))
    # axis 0 is rows (vertical shift dy), axis 1 is columns (horizontal shift dx)
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])

image = X_train[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)
plt.figure(figsize=(12, 3))
plt.subplot(131)
plt.title("Original", fontsize=14)
plt.imshow(image.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(132)
plt.title("Shifted down", fontsize=14)
plt.imshow(shifted_image_down.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.subplot(133)
plt.title("Shifted left", fontsize=14)
plt.imshow(shifted_image_left.reshape(28, 28), interpolation="nearest", cmap="Greys")
plt.show()

X_train_augmented = [image for image in X_train]
y_train_augmented = [label for label in y_train]
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)

X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

knn_clf = KNeighborsClassifier(**grid_search.best_params_)
knn_clf.fit(X_train_augmented, y_train_augmented)
y_pred = knn_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

3. Tackling the Titanic dataset

The goal is to predict whether a passenger survived, based on attributes such as age, sex, passenger class, and port of embarkation.

# ------------- Download the Titanic dataset -------------
import os
import urllib.request

TITANIC_PATH = os.path.join("dataset", "titanic")
DOWNLOAD_URL = "https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/titanic/"

def fetch_titanic_data(url=DOWNLOAD_URL, path=TITANIC_PATH):
    if not os.path.isdir(path):
        os.makedirs(path)
    for filename in ('train.csv', 'test.csv'):
        filepath = os.path.join(path, filename)
        if not os.path.isfile(filepath):
            print('Downloading', filename)
            urllib.request.urlretrieve(url + filename, filepath)

# fetch_titanic_data()

# ------------- Load and preprocess the dataset -------------
import pandas as pd
import numpy as np

def load_titanic_data(filename, titanic_path=TITANIC_PATH):
    csv_path = os.path.join(titanic_path, filename)
    return pd.read_csv(csv_path)

train_data = load_titanic_data('train.csv')
test_data = load_titanic_data('test.csv')

# Use PassengerId as the index
train_data = train_data.set_index("PassengerId")
test_data = test_data.set_index("PassengerId")
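
# Optional quick look at the raw data before building the pipelines; nothing below
# depends on it. Age and Embarked have missing values, which is why imputers are used.
train_data.info()
print(train_data["Survived"].value_counts())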

# Build the preprocessing pipelines
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_pipeline = Pipeline([  # pipeline for numerical attributes
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([  # pipeline for categorical attributes
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('cat_encoder', OneHotEncoder(sparse=False))  # on scikit-learn >= 1.2 use sparse_output=False instead
])

# Combine the pipelines into a single column transformer
from sklearn.compose import ColumnTransformer

num_attribs = ["Age", "SibSp", "Parch", "Fare"]
cat_attribs = ["Pclass", "Sex", "Embarked"]

preprocess_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

# Preprocess the training data
X_train = preprocess_pipeline.fit_transform(train_data[num_attribs+cat_attribs])
y_train = train_data['Survived']

# ------------- Train a model -------------
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)

X_test = preprocess_pipeline.transform(test_data[num_attribs+cat_attribs])  # transform only: reuse the statistics fitted on the training set
y_pred = forest_clf.predict(X_test)

# ------------- Evaluate and validate the models -------------
from sklearn.model_selection import cross_val_score

forest_score = cross_val_score(forest_clf, X_train, y_train, cv=10)
print(forest_score.mean())

# Try a different model and evaluate it
from sklearn.svm import SVC

svm_clf = SVC(gamma='auto')
svm_score = cross_val_score(svm_clf, X_train, y_train, cv=10)
print(svm_score.mean())

svm_clf.fit(X_train, y_train)
y_pred_svc = svm_clf.predict(X_test)
result = pd.DataFrame(y_pred_svc, index=test_data.index)
result.columns = ['Survived']
result.to_csv("gender submission.csv")

# Draw box plots to compare the two models
import matplotlib.pyplot as plt

plt.figure(figsize=(8,4))
plt.plot([1]*10, svm_score, '.')
plt.plot([2]*10, forest_score, '.')
plt.boxplot([svm_score, forest_score], labels=("SVM", "Random Forest"))
plt.ylabel("Accuracy")
plt.show()

(Figure: box plots of the 10-fold cross-validation accuracy for the SVM and Random Forest models.)

4. Building a spam classifier

# ------------ Download the emails ------------
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("dataset", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

# fetch_spam_data()

# ------------ Load the dataset ------------
HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

# ------------ Parse the emails ------------
import email
import email.parser
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

# ------------ Inspect the email structures ------------
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

# print(structures_counter(ham_emails).most_common())
# print(structures_counter(spam_emails).most_common())
# Ham emails are mostly plain text, while a lot of spam is HTML.
# In addition, quite a few ham emails are PGP-signed, whereas spam emails are not.
# In short, the email structure looks like useful information.
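
# A small, optional check of the claim above, using the helper defined earlier:
# compare the share of emails whose structure involves HTML in each corpus.
def html_share(emails):
    return sum("html" in get_email_structure(e) for e in emails) / len(emails)

# print("ham :", html_share(ham_emails))
# print("spam:", html_share(spam_emails))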

# ------------ Split the dataset ------------
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ------------ Convert emails to plain text ------------
# Either BeautifulSoup or regular expressions can be used; the latter is used below (a BeautifulSoup sketch follows the function)
import re
from html import unescape

def html_to_plain_text(html):  # convert an HTML email body to plain text
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)
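
# As noted above, BeautifulSoup is an alternative to the regular expressions. A
# minimal sketch (assuming the bs4 package is installed); it is not used by the
# rest of the code and only illustrates that option.
try:
    from bs4 import BeautifulSoup

    def html_to_plain_text_bs4(html):
        return BeautifulSoup(html, "html.parser").get_text(separator="\n")
except ImportError:
    print("bs4 module missing")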

def email_to_text(email):  # return plain text regardless of the email's format
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)
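
# Optional sanity check: pick a spam email that is pure HTML and confirm it converts
# to readable plain text.
html_spam_emails = [e for e in spam_emails if get_email_structure(e) == "text/html"]
# print(email_to_text(html_spam_emails[0])[:200], "...")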

# ------------ Reduce derived words to their stems ------------
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("缺失NLTK模块")
    stemmer = None

# ------------ Replace URLs ------------
try:
    import urlextract

    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("缺失urlextract模块")
    url_extractor = None

# ------------ Define custom transformers ------------
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.sparse import csr_matrix

# Transformer 1: convert emails to word counts
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True, replace_url=True,
                 replace_number=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_url = replace_url
        self.replace_number = replace_number
        self.stemming = stemming

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_url and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, "URL ")
            if self.replace_number:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())  # count every word in the cleaned text
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)
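
# A quick usage example (optional): see what the transformer produces for a few training emails.
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_train[:3])
print(X_few_wordcounts)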

# Transformer 2: convert word counts to sparse vectors
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self

    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size+1))
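
# Usage example (optional): turn the word counts above into a sparse matrix with a
# tiny vocabulary. Column 0 counts words that are not in the vocabulary.
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
print(X_few_vectors.toarray())
print(vocab_transformer.vocabulary_)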

# ------------ Train a classifier ------------
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

preprocess_line = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer())
])

X_train_transformed = preprocess_line.fit_transform(X_train)
X_test_transformed = preprocess_line.transform(X_test)  # transform only; the pipeline was fitted on the training set

from sklearn.metrics import precision_score, recall_score

log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed, y_train)
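
# Optional extra check, using the cross_val_score imported above: 3-fold
# cross-validation accuracy of the classifier on the transformed training set.
cv_scores = cross_val_score(log_clf, X_train_transformed, y_train, cv=3)
print("CV accuracy: {:.2f}%".format(100 * cv_scores.mean()))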

y_pred = log_clf.predict(X_test_transformed)

print("精确度: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("召回率: {:.2f}%".format(100 * recall_score(y_test, y_pred)))
