Overview
import threading

import requests
from bs4 import BeautifulSoup

# Build the list of forum pages to crawl: the landing page plus pages 1-49.
url_list = ['http://ubuntuforums.org/forumdisplay.php?f=333']
for page in range(1, 50):
    url_list.append('http://ubuntuforums.org/forumdisplay.php?f=333&page=%d' % page)

class MyThread(threading.Thread):
    """Thin wrapper that runs func(*args) in its own thread."""
    def __init__(self, func, args, name=""):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        # apply() only exists in Python 2; unpack the arguments directly.
        self.func(*self.args)

def running(url):
    # lock.acquire()
    html = requests.get(url)
    if html.status_code == 200:
        soup = BeautifulSoup(html.text, 'html.parser')
        with open('/home/zhg/Pictures/cao.txt', 'a+') as f:
            # Each thread-title link on the page carries the 'title' class.
            for link in soup.find_all('a', 'title'):
                s = 'http://ubuntuforums.org/' + str(link.get('href')) + ' ' + link.get_text()
                f.write(s + '\n')
    # lock.release()

if __name__ == '__main__':
    thread_list = [MyThread(running, (url,), running.__name__) for url in url_list]
    for t in thread_list:
        t.daemon = True
        t.start()
    for t in thread_list:
        t.join()
    print("process ended")

    # Check the output file for lines that were written more than once.
    with open('/home/zhg/Pictures/cao.txt', 'r') as f:
        f_list = f.readlines()
    for x in set(f_list):
        if f_list.count(x) > 1:
            print("%r has been found %d times" % (x, f_list.count(x)))
Also, if I add the lock, the program runs at the same speed as a plain for loop without any multithreading. Why is that?

Er, this problem is solved now; I'll leave the question up for others to refer to.

The data was never deduplicated before being stored to the file, and the forum pages being crawled have sticky (pinned) threads, so over the 50-plus pages crawled, the three sticky threads showed up 40-plus times.

I had assumed it was because I hadn't added a lock.
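For anyone landing here with the same two symptoms, a minimal sketch of both fixes: deduplicate before writing, and hold the lock only around the shared-state update. The seen set, results list, and the single write at the end are illustrative choices, not part of the original code; the point is that the slow requests.get call sits outside the lock:

import threading

import requests
from bs4 import BeautifulSoup

lock = threading.Lock()
seen = set()       # links already recorded, shared across all threads
results = []       # deduplicated output lines

def running(url):
    # The network request happens outside the lock, so downloads overlap.
    # Wrapping this whole function in lock.acquire()/lock.release(), as in
    # the commented-out lines above, serializes the threads, which is why
    # the fully locked version was no faster than a plain for loop.
    html = requests.get(url)
    if html.status_code != 200:
        return
    soup = BeautifulSoup(html.text, 'html.parser')
    for link in soup.find_all('a', 'title'):
        s = 'http://ubuntuforums.org/' + str(link.get('href')) + ' ' + link.get_text()
        # Lock only the check-and-update of shared state: sticky threads
        # that appear on every page get recorded exactly once.
        with lock:
            if s not in seen:
                seen.add(s)
                results.append(s)

# After joining the threads, write the deduplicated lines in one go:
# with open('/home/zhg/Pictures/cao.txt', 'w') as f:
#     f.write('\n'.join(results) + '\n')

Held this way, the lock is taken for microseconds per link rather than for the full duration of a page download, so the threads still spend their time overlapping network I/O.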