爬取(明星网)明星面部数据
1
2
3
4
5from bs4 import BeautifulSoup import os import requests import time
1 下载数据
1.1 请求分析
- Request
1
2
3
4
5GET /upload/thumb/2015/11-16/0-uwo1Wk.jpg HTTP/1.1 Host: img.mingxing.com Referer:http://img.mingxing.com//mingxing//20181015/88aa35c304dc06e822bb2efdd33497a5.jpg User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400
1
2
3
4
5
6
7
8
9
10
11
12
13
def get_img(url, path):
    """Download the image at ``url`` and write its raw bytes to ``path``.

    Args:
        url: Absolute URL of the image. The same value is sent as the
            ``Referer`` header.
        path: Destination file path. Missing parent directories are created.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 "
            "Core/1.63.6821.400 QQBrowser/10.3.3040.400"
        ),
        "Referer": url,
    }
    # A timeout keeps a single stalled connection from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=30)
    # Fail loudly rather than saving an HTML error page under an image name.
    response.raise_for_status()

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(path, "wb") as fw:
        fw.write(response.content)
1
2
3
4
5
6
if __name__ == "__main__":
    # Smoke test: download one known image into a scratch file.
    path = "./dataset/tmp.jpg"
    url = "http://img.mingxing.com//mingxing//20181015/88aa35c304dc06e822bb2efdd33497a5.jpg"
    get_img(url, path)
2 明星列表页面
- Request
1
2
3
4
5
6
7
8
9
10GET /ziliao/index?&p=1 HTTP/1.1 Host: www.mingxing.com Connection: keep-alive User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400 Upgrade-Insecure-Requests: 1 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8 Accept-Encoding: gzip, deflate Accept-Language: zh-CN,zh;q=0.9 Cookie: __51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; __tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; __51laig__=21
2.1 单页明星列表
1
# URL of the paginated celebrity index; get_celebrities_one_page() sends the
# page number to it as the "p" query parameter.
URL_MINGXING_CELEBRITY_LIST = "http://www.mingxing.com/ziliao/index"
1
2
3
4
5
6
<div>:class="page_starlist",明星列表
-><ul>
--><li>
---><a>:明星页面url
----><span>
-----><img>:src - 明星图片url,alt = 明星姓名
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
def get_celebrities_one_page(url, idx_page):
    """Fetch one page of the celebrity index and parse its entries.

    Args:
        url: URL of the celebrity list endpoint.
        idx_page: Page number, sent as the ``p`` query parameter.

    Returns:
        A list of dicts, one per ``<img>`` found inside the
        ``<div class="page_starlist">`` container, with keys:

        * ``name`` -- the image's ``alt`` text, stripped;
        * ``url`` -- ``"http://www.mingxing.com"`` + the ``href`` of the
          enclosing ``<a>``;
        * ``img_urls`` -- a one-element list holding the image ``src``.

        Images lacking an ``alt``, a ``src`` or an enclosing link are
        skipped. An empty list is returned when the container is absent.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        "Connection": "keep-alive",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 "
            "Core/1.63.6821.400 QQBrowser/10.3.3040.400"
        ),
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # NOTE(review): cookie captured from a browser session; presumably
        # stale by now -- confirm whether the site needs it at all.
        "Cookie": (
            "__51cke__=; "
            "UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; "
            "CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; "
            "PHPSESSID=9btdnv30htpj54ies9em19pan1; "
            "right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; "
            "__tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; "
            "__51laig__=21"
        ),
    }
    params = {"p": idx_page}

    # A timeout keeps a single stalled connection from hanging the crawl.
    response = requests.get(url, params=params, headers=headers, timeout=30)
    # Surface HTTP errors here; otherwise an error page would just look like
    # a page with no celebrities.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    container = soup.find("div", class_="page_starlist")
    if container is None:
        # find() returns None when the layout differs; treat as "no entries".
        return []

    lst_celebrities = []
    for img in container.find_all("img"):
        anchor = img.find_parent("a")
        href = anchor.get("href") if anchor is not None else None
        name = (img.get("alt") or "").strip()
        src = img.get("src")
        # Incomplete entries cannot be downloaded or filed under a name.
        if not (href and name and src):
            continue
        lst_celebrities.append({
            "name": name,
            "url": "http://www.mingxing.com" + href,
            "img_urls": [src],
        })
    return lst_celebrities
1
2
3
if __name__ == "__main__":
    # Demo: print the entries parsed from the first index page.
    idx_page = 1
    first_page = get_celebrities_one_page(URL_MINGXING_CELEBRITY_LIST, idx_page)
    print(first_page)
2.2 多页明星列表
1
# Page count handed to get_celebrities() as its num_pages argument.
NUM_PAGES = 10
1
2
3
4
5
6
7
8
9
def get_celebrities(url, num_pages, delay=3):
    """Collect celebrity entries from pages ``1..num_pages`` of the index.

    Args:
        url: URL of the celebrity list endpoint, forwarded to
            ``get_celebrities_one_page``.
        num_pages: Number of index pages to fetch, starting at page 1.
        delay: Seconds to sleep after each page request.

    Returns:
        The concatenation, in page order, of the lists returned by
        ``get_celebrities_one_page`` for every fetched page.
    """
    lst_celebrities = []
    # Pages are 1-based, so the upper bound must be inclusive to fetch
    # exactly ``num_pages`` pages.
    for idx_page in range(1, num_pages + 1):
        lst_celebrities.extend(get_celebrities_one_page(url, idx_page))
        # Throttle consecutive requests to the site.
        time.sleep(delay)
    return lst_celebrities
1
2
3
if __name__ == "__main__":
    # Demo: crawl the configured number of index pages and dump the result.
    lst_celebrities = get_celebrities(
        url=URL_MINGXING_CELEBRITY_LIST,
        num_pages=NUM_PAGES,
    )
    print(lst_celebrities)
[{‘name’: ‘鹿晗’, ‘url’: ‘http://www.mingxing.com/mingxing/index/name/luhan.html’, ‘img_urls’: [‘http://img.mingxing.com/upload/thumb/6/17097.jpg’]}, {‘name’: ‘迪丽热巴’, ‘url’: ‘http://www.mingxing.com/mingxing/index/name/dilireba.html’, ‘img_urls’:
…
[‘http://img.mingxing.com/upload/thumb/5/14274.jpg’]}, {‘name’: ‘王艺洁’, ‘url’: ‘http://www.mingxing.com/mingxing/index/name/wangyijie.html’, ‘img_urls’: [‘http://img.mingxing.com/upload/thumb/5/14276.jpg’]}, {‘name’: ‘段林希’, ‘url’: ‘http://www.mingxing.com/mingxing/index/name/duanlinxi.html’, ‘img_urls’: [‘http://img.mingxing.com/upload/thumb/5/14277.jpg’]}]
3 明星页面
1
2
3
4
5
6
7
8
9
10
11GET /mingxing/index/name/luhan.html HTTP/1.1 Host: www.mingxing.com Connection: keep-alive Upgrade-Insecure-Requests: 1 User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6821.400 QQBrowser/10.3.3040.400 Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8 Referer: http://www.mingxing.com/ziliao/index?&p=1 Accept-Encoding: gzip, deflate Accept-Language: zh-CN,zh;q=0.9 Cookie: __51cke__=; UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; PHPSESSID=9btdnv30htpj54ies9em19pan1; right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A29%7D; __tins__18838395=%7B%22sid%22%3A%201553844269026%2C%20%22vd%22%3A%201%2C%20%22expires%22%3A%201553846069026%7D; __51laig__=30; CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553843231
1
2
3
4
5
<ul>:class="page_starphoto",明星图片列表
-><li>
--><a>
---><span>
----><img>:src - 明星图片url
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
def get_celebrity_img_urls(url):
    """Return the image URLs listed on a celebrity's page.

    Args:
        url: URL of the celebrity page.

    Returns:
        The ``src`` of every ``<img>`` inside the
        ``<ul class="page_starphoto">`` element, in document order. Images
        without a ``src`` are skipped. An empty list is returned when the
        element is absent.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        "Connection": "keep-alive",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 "
            "Core/1.63.6821.400 QQBrowser/10.3.3040.400"
        ),
        "Upgrade-Insecure-Requests": "1",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        # NOTE(review): cookie captured from a browser session; presumably
        # stale by now -- confirm whether the site needs it at all.
        "Cookie": (
            "__51cke__=; "
            "UM_distinctid=169c7147623b32-041b6f54ccddbe-3257487f-232800-169c7147624ddd; "
            "CNZZDATA30054349=cnzz_eid%3D515205138-1553821270-https%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1553821270; "
            "PHPSESSID=9btdnv30htpj54ies9em19pan1; "
            "right_adv=%7B%22time%22%3A%222019329%22%2C%22number%22%3A20%7D; "
            "__tins__18838395=%7B%22sid%22%3A%201553825001869%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201553827085186%7D; "
            "__51laig__=21"
        ),
    }
    # A timeout keeps a single stalled connection from hanging the crawl.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP errors here; otherwise an error page would just look like
    # a celebrity with no photos.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "lxml")
    gallery = soup.find("ul", class_="page_starphoto")
    if gallery is None:
        # find() returns None when the page has no photo list.
        return []
    return [img["src"] for img in gallery.find_all("img") if img.get("src")]
1
2
3
if __name__ == "__main__":
    # Demo: request one celebrity page; the returned list is discarded.
    luhan_page = "http://www.mingxing.com/mingxing/index/name/luhan.html"
    get_celebrity_img_urls(luhan_page)
4 创建明星面部数据集
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
if __name__ == "__main__":
    # Build the dataset: one directory per celebrity under DATASET_PATH,
    # holding the index thumbnail followed by the photos from the
    # celebrity's own page, numbered from 0001.
    from urllib.parse import urlparse

    NUM_PAGES = 10
    DATASET_PATH = "./dataset"

    # Celebrity list
    lst_celebrities = get_celebrities(URL_MINGXING_CELEBRITY_LIST, NUM_PAGES)

    for celebrity in lst_celebrities:
        # Celebrity directory
        celebrity_dir = os.path.join(DATASET_PATH, celebrity["name"])
        print("*" * 10)
        print("celebrity: {}".format(celebrity["name"]))
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(celebrity_dir, exist_ok=True)

        # Celebrity page: a network failure here should cost only this
        # celebrity's gallery, not the whole crawl.
        try:
            celebrity["img_urls"].extend(get_celebrity_img_urls(celebrity["url"]))
        except requests.RequestException as err:
            print("skip gallery {} ({})".format(celebrity["url"], err))

        for idx_img, img_url in enumerate(celebrity["img_urls"], start=1):
            # Keep the extension from the URL so .gif/.png files are not
            # mislabelled; fall back to .jpg when the URL has none.
            extension = os.path.splitext(urlparse(img_url).path)[1] or ".jpg"
            img_path = os.path.join(
                celebrity_dir, "{:04d}{}".format(idx_img, extension)
            )
            try:
                get_img(img_url, img_path)
            except requests.RequestException as err:
                print("failed {} ({})".format(img_url, err))
            else:
                print("download {} ---> {}".format(img_url, img_path))
            # Pause between downloads to avoid hammering the image host.
            time.sleep(3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58********** celebrity: 鹿晗 download http://img.mingxing.com/upload/thumb/6/17097.jpg ---> ./dataset鹿晗001.jpg download http://img.mingxing.com/mingxing//20180928/2e8dc41ba5f72d2e0ed005541a515a54.jpg ---> ./dataset鹿晗002.jpg download http://img.mingxing.com/mingxing//20180319/c84bf559d0dd0e2fae005f84a4016f6c.jpg ---> ./dataset鹿晗003.jpg download http://img.mingxing.com/upload/thumb/2017/06-28/0-5QoNse.jpg ---> ./dataset鹿晗004.jpg download http://img.mingxing.com/upload/thumb/2017/05-02/0-gOgsmr.jpg ---> ./dataset鹿晗005.jpg download http://img.mingxing.com/upload/thumb/2016/12-20/0-2GSIij.jpg ---> ./dataset鹿晗006.jpg download http://img.mingxing.com/upload/thumb/2016/07-07/0-yaTLqz.jpg ---> ./dataset鹿晗007.jpg download http://img.mingxing.com/upload/thumb/2016/04-21/0-iDv7Fj.jpg ---> ./dataset鹿晗008.jpg download http://img.mingxing.com/upload/thumb/2016/04-11/0-FTCO8H.jpg ---> ./dataset鹿晗009.jpg download http://img.mingxing.com/upload/thumb/2016/03-21/0-op5Sbt.jpg ---> ./dataset鹿晗010.jpg download http://img.mingxing.com/upload/thumb/2015/12-30/0-08NWI0.jpg ---> ./dataset鹿晗011.jpg download http://img.mingxing.com/upload/thumb/2015/12-08/0-wlgnGF.jpg ---> ./dataset鹿晗012.jpg download http://img.mingxing.com/upload/thumb/2015/11-16/0-uwo1Wk.jpg ---> ./dataset鹿晗013.jpg ********** celebrity: 迪丽热巴 download http://img.mingxing.com/content/20180103/535f03beaa9b7f0cb3c6f2f302886bf8.jpg ---> ./dataset迪丽热巴001.jpg download http://img.mingxing.com/mingxing//20181015/14b77dfea0cad1360955d818fcbb0de6.jpg ---> ./dataset迪丽热巴002.jpg download http://img.mingxing.com/mingxing//20180921/28e35a28498d760e908abce74fd40f5f.jpg ---> ./dataset迪丽热巴003.jpg download http://img.mingxing.com/mingxing//20180726/17702f5a9b8b998cbb0c70c260b40ad3.gif ---> ./dataset迪丽热巴004.jpg download http://img.mingxing.com/mingxing//20180620/ea20b15f13f6b34d1b4764553bfba7a9.png ---> ./dataset迪丽热巴005.jpg download http://img.mingxing.com/mingxing//20180417/985a84ccae9646f31f4dd717ccd40508.jpg ---> ./dataset迪丽热巴006.jpg download 
http://img.mingxing.com/mingxing//20180411/5376e604692d6fb42ae7a48e73143eb8.jpg ---> ./dataset迪丽热巴007.jpg download http://img.mingxing.com/mingxing/20180301/bdd3cbbf262d7793f21ed10975744c22.jpg ---> ./dataset迪丽热巴008.jpg download http://img.mingxing.com/mingxing/20180301/3418a7189704f4e68f81a29b4320af87.jpg ---> ./dataset迪丽热巴009.jpg download http://img.mingxing.com/mingxing/20180227/d6aa477ed34271c06fe9edb4dccc9e94.jpg ---> ./dataset迪丽热巴010.jpg download http://img.mingxing.com/mingxing/20180227/92dccee3c3ab96b8aae57f2f0469b1c2.jpg ---> ./dataset迪丽热巴011.jpg download http://img.mingxing.com/mingxing/20180226/0fc7ff656cabc975cbb349daeb6ee793.jpg ---> ./dataset迪丽热巴012.jpg download http://img.mingxing.com/mingxing/20180225/45a68453086b2307eaf10b7921b7e199.jpg ---> ./dataset迪丽热巴013.jpg ... celebrity: 约翰尼·德普 download http://img.mingxing.com/upload/thumb/5/14261.jpg ---> ./dataset约翰尼·德普001.jpg download http://img.mingxing.com/upload/thumb/2016/05-24/0-re7Tem.jpg ---> ./dataset约翰尼·德普002.jpg download http://img.mingxing.com/upload/thumb/2016/04-13/0-X6RYXs.jpg ---> ./dataset约翰尼·德普003.jpg download http://img.mingxing.com/upload/thumb/2016/03-25/0-bxK5os.jpg ---> ./dataset约翰尼·德普004.jpg download http://img.mingxing.com/upload/thumb/2016/03-25/0-h77lr9.jpg ---> ./dataset约翰尼·德普005.jpg download http://img.mingxing.com/upload/thumb/2016/03-17/0-U3Y3EK.jpg ---> ./dataset约翰尼·德普006.jpg download http://img.mingxing.com/upload/thumb/2016/03-17/0-WUdojP.jpg ---> ./dataset约翰尼·德普007.jpg download http://img.mingxing.com/upload/thumb/2016/03-17/0-ghntJ4.jpg ---> ./dataset约翰尼·德普008.jpg download http://img.mingxing.com/upload/thumb/2016/02-26/0-G2Th8a.jpg ---> ./dataset约翰尼·德普009.jpg download http://img.mingxing.com/upload/thumb/2016/02-23/0-cARUg7.jpg ---> ./dataset约翰尼·德普010.jpg download http://img.mingxing.com/upload/thumb/2016/02-18/0-DLYZNo.jpg ---> ./dataset约翰尼·德普011.jpg download http://img.mingxing.com/upload/thumb/2016/01-29/0-Pe5YMh.jpg ---> ./dataset约翰尼·德普012.jpg ********** celebrity: 
雨果·维文 download http://img.mingxing.com/upload/thumb/5/14262.jpg ---> ./dataset雨果·维文001.jpg download http://img.mingxing.com/upload/thumb/2016/04-13/0-Pm9m6p.jpg ---> ./dataset雨果·维文002.jpg download http://img.mingxing.com/upload/thumb/2016/04-13/0-kqjoN7.jpg ---> ./dataset雨果·维文003.jpg download http://img.mingxing.com/upload/thumb/2016/04-08/0-03NXtB.jpg ---> ./dataset雨果·维文004.jpg download http://img.mingxing.com/upload/thumb/2016/03-30/0-TJRqeD.jpg ---> ./dataset雨果·维文005.jpg download http://img.mingxing.com/upload/thumb/2016/02-26/0-Wuurq1.jpg ---> ./dataset雨果·维文006.jpg download http://img.mingxing.com/upload/thumb/2016/02-18/0-4fqOgM.jpg ---> ./dataset雨果·维文007.jpg ********** celebrity: 希亚·拉博夫
最后
以上就是刻苦可乐最近收集整理的关于爬取(明星网)明星面部数据的全部内容,更多相关爬取(明星网)明星面部数据的内容请搜索靠谱客的其他文章。
发表评论 取消回复