scrapy+selenium爬取五个外国新闻网站关于“中国“的新闻，并分析1.题目2.scrapy+selenium爬取动态网站3.最终结果4.下载

59 阅读 0 评论 39 点赞

我是靠谱客的博主丰富灯泡，这篇文章主要介绍scrapy+selenium爬取五个外国新闻网站关于“中国“的新闻，并分析1.题目2.scrapy+selenium爬取动态网站3.最终结果4.下载，现在分享给大家，希望可以做个参考。

1.题目

爬取一下5家的加拿大新闻网站包含“China” /
"Chinese"的数据，时间范围为2020年1月日至今，分析每家媒体文章的词云图、情感趋势图等。
https://www.cbc.ca/
https://nationalpost.com/
https://www.thestar.com/?redirect=true
https://www.ctvnews.ca/
https://globalnews.ca/national/

2.scrapy+selenium爬取动态网站

以CBC网站爬取为例,spider编写如下

复制代码

import scrapy
from ..items import CbcItem
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from threading import Thread, Lock
class ChinaSpider(scrapy.Spider):
    name = 'china'
    allowed_domains = ['https://www.cbc.ca/']
    start_urls = ['https://www.cbc.ca/search?q=china§ion=news']

def __init__(self):
        # 在初始化时，创建driver
        super(ChinaSpider, self).__init__(name='china')
        option = FirefoxOptions()
        option.headless = True
        self.driver = webdriver.Firefox(options=option)

def parse(self, response):
        tr_list = response.xpath("//div[@class='contentListCards']/a")
        for tr in tr_list:
            item = CbcItem()
            href = tr.xpath("./@href").extract_first()
            time = tr.xpath(
                    "./div/div/div[@class='card-content-bottom']/div/div/time/@datetime").extract_first()
            year = time[:4]
            month = time[5:7]
            print(year+month)
            if int(month) >=1 and int(year) == 2020:
                if (href[1:5] == 'news'):
                    item["title"] = tr.xpath("./div/div/div[@class='card-content-top']/h3/text()").extract_first()
                    item["brief"] = tr.xpath(
                        "./div/div/div[@class='card-content-top']/div[@id='d-card-']/text()").extract_first()
                    item['herf'] = 'https://www.cbc.ca' + href
                    item['publish_date'] = tr.xpath(
                        "./div/div/div[@class='card-content-bottom']/div/div/time/@datetime").extract_first()
                   # 详情页爬取
                    yield scrapy.Request(
                        item['herf'],
                        callback=self.parse_detail,
                        meta={"item": item},
                        dont_filter=True
                    )
           
    def parse_detail(self, response):
        item = response.meta['item']
        item["content"] = response.xpath("//div[@class='story']/span/p/text()").extract()
        item["content"] = "".join(item["content"])
        print(item)
        yield(item)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import scrapy
from ..items import CbcItem
from selenium import webdriver
from selenium.webdriver import FirefoxOptions
from threading import Thread, Lock
class ChinaSpider(scrapy.Spider):
    name = 'china'
    allowed_domains = ['https://www.cbc.ca/']
    start_urls = ['https://www.cbc.ca/search?q=china&section=news']

    def __init__(self):
        # 在初始化时，创建driver
        super(ChinaSpider, self).__init__(name='china')
        option = FirefoxOptions()
        option.headless = True
        self.driver = webdriver.Firefox(options=option)

    def parse(self, response):
        tr_list = response.xpath("//div[@class='contentListCards']/a")
        for tr in tr_list:
            item = CbcItem()
            href = tr.xpath("./@href").extract_first()
            time = tr.xpath(
                    "./div/div/div[@class='card-content-bottom']/div/div/time/@datetime").extract_first()
            year = time[:4]
            month = time[5:7]
            print(year+month)
            if int(month) >=1 and int(year) == 2020:
                if (href[1:5] == 'news'):
                    item["title"] = tr.xpath("./div/div/div[@class='card-content-top']/h3/text()").extract_first()
                    item["brief"] = tr.xpath(
                        "./div/div/div[@class='card-content-top']/div[@id='d-card-']/text()").extract_first()
                    item['herf'] = 'https://www.cbc.ca' + href
                    item['publish_date'] = tr.xpath(
                        "./div/div/div[@class='card-content-bottom']/div/div/time/@datetime").extract_first()
                   # 详情页爬取
                    yield scrapy.Request(
                        item['herf'],
                        callback=self.parse_detail,
                        meta={"item": item},
                        dont_filter=True
                    )
           
    def parse_detail(self, response):
        item = response.meta['item']
        item["content"] = response.xpath("//div[@class='story']/span/p/text()").extract()
        item["content"] = "".join(item["content"])
        print(item)
        yield(item)

在middlewares中编写自己的Download方法,并在setting中修改
主要是完成浏览器对js页面的解析

复制代码

if spider.name == "china":
                spider.driver.get(request.url)
                # CBc爬虫的初始页面不断点击
                if (request.url == "https://www.cbc.ca/search?q=china§ion=news"):
                    print("我只运行了一次")
                    for i in range(70):
                        button = spider.driver.find_element_by_xpath(
                            "//*[@id='content']/div/div[4]/section/div[1]/div[2]/div/button")
                        spider.driver.execute_script("arguments[0].click();", button)
                        print("我已经点击了%d" % i)
                        time.sleep(2)

# 由于页面数据加载需要进行滚动，但并不是所有js动态数据都需要滚动。
                for x in range(1, 11, 2):
                    height = float(x) / 10
                    js = "document.documentElement.scrollTop = document.documentElement.scrollHeight * %f" % height
                    spider.driver.execute_script(js)
                    time.sleep(0.2)
                origin_code = spider.driver.page_source
                # 将源代码构造成为一个Response对象，并返回。
                res = HtmlResponse(url=request.url, encoding='utf8', body=origin_code, request=request)
                # res = Response(url=request.url, body=bytes(origin_code), request=request)
                return res

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
            if spider.name == "china":
                spider.driver.get(request.url)
                # CBc爬虫的初始页面不断点击
                if (request.url == "https://www.cbc.ca/search?q=china&section=news"):
                    print("我只运行了一次")
                    for i in range(70):
                        button = spider.driver.find_element_by_xpath(
                            "//*[@id='content']/div/div[4]/section/div[1]/div[2]/div/button")
                        spider.driver.execute_script("arguments[0].click();", button)
                        print("我已经点击了%d" % i)
                        time.sleep(2)

                # 由于页面数据加载需要进行滚动，但并不是所有js动态数据都需要滚动。
                for x in range(1, 11, 2):
                    height = float(x) / 10
                    js = "document.documentElement.scrollTop = document.documentElement.scrollHeight * %f" % height
                    spider.driver.execute_script(js)
                    time.sleep(0.2)
                origin_code = spider.driver.page_source
                # 将源代码构造成为一个Response对象，并返回。
                res = HtmlResponse(url=request.url, encoding='utf8', body=origin_code, request=request)
                # res = Response(url=request.url, body=bytes(origin_code), request=request)
                return res