正则小案例(手机号、邮箱、URL、身份证、古诗文爬虫)

165 阅读 0 评论 109 点赞

我是靠谱客的博主自信金毛，这篇文章主要介绍正则小案例(手机号、邮箱、URL、身份证、古诗文爬虫)，现在分享给大家，希望可以做个参考。

1.验证手机号码（第一位是 1，第二位是 [34578] 中的一位，其余 9 位均为数字）

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
import re

# Mobile-number shape checked here: a leading "1", a second digit taken from
# 3/4/5/7/8, then exactly nine more digits (11 characters in total).
# The raw string keeps ``\d`` intact; without the backslash the pattern reads
# ``d{9}`` and only accepts nine literal "d" characters.
PHONE_PATTERN = re.compile(r'1[34578]\d{9}')

text = '1357032****'  # masked placeholder -- replace with the number to check
ret = None  # stays None when the length check fails or the pattern rejects text
if len(text) != 11:
    print("phone number'length is error!")
else:
    ret = PHONE_PATTERN.fullmatch(text)
    # Test the match object explicitly instead of letting ``ret.group()``
    # raise AttributeError on a failed match.
    if ret is None:
        print('匹配到非数字字符', text)
    else:
        print('正确手机号码：', ret.group())

2.验证邮箱

复制代码

1
2
3
4
5
# Simplified e-mail shape: one or more word characters (letters, digits,
# underscore -- hence ``\w``) before the "@", then a lowercase/digit domain
# label, a literal dot, and a lowercase suffix.
# Both backslashes matter: a bare ``w+`` only matches the letter "w", and an
# unescaped ``.`` matches any character instead of a dot.
EMAIL_PATTERN = re.compile(r'\w+@[a-z0-9]+\.[a-z]+')

text = '594398***@163.com'  # masked placeholder -- replace with the address to check
ret = EMAIL_PATTERN.match(text)
# ``match`` returns None on failure; guard before calling ``group()`` so a
# non-matching address is reported rather than raising AttributeError.
if ret is None:
    print('invalid email address:', text)
else:
    print(ret.group())

3.验证URL

复制代码

1
2
3
4
5
# A scheme (http, https or ftp), "://", then every following non-whitespace
# character.  ``\S`` is the escaped form of "not whitespace"; the unescaped
# ``[^s]`` means "anything except the letter s" and cuts the URL short at the
# first "s" after the scheme.
URL_PATTERN = re.compile(r'(http|https|ftp)://\S+')

text = 'https://www.runoob.com/python/python-exceptions.html'
ret = URL_PATTERN.match(text)
# Guard against a failed match so a non-URL input is reported instead of
# raising AttributeError on ``None.group()``.
if ret is None:
    print('invalid URL:', text)
else:
    print(ret.group())

4.验证身份证（前 17 位为数字，最后一位为数字或 x/X）：

复制代码

1
2
3
4
5
# 18-character ID number shape: seventeen digits followed by one final
# character that is a digit or the letter x/X.
# The raw string restores both ``\d`` escapes; without them the pattern asks
# for seventeen literal "d" characters.
ID_CARD_PATTERN = re.compile(r'\d{17}[\dxX]')

text = '44262319980425***X'  # masked placeholder -- replace with the number to check
# fullmatch (rather than match) also rejects input with extra trailing
# characters after the 18th position.
ret = ID_CARD_PATTERN.fullmatch(text)
if ret is None:
    print('invalid ID card number:', text)
else:
    print(ret.group())

5.综合实战(古诗文爬虫)

复制代码

import re
import requests

def parse_poems(html):
    """Extract poem records from the HTML text of one listing page.

    Four independent regex passes collect titles, dynasties, authors and
    poem bodies; the results are then paired up by position.

    Args:
        html: Page markup as a string.

    Returns:
        A list of dicts with the keys ``'title'``, ``'dynasty'``,
        ``'author'`` and ``'content'``; empty when nothing matches.
    """
    # re.DOTALL lets ``.`` cross newlines, since each record spans several lines.
    titles = re.findall(r'<div class="sons">.*?<b>(.*?)</b>', html, re.DOTALL)
    # First link inside the "source" paragraph (the one preceding <span>).
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>.*?<span>', html, re.DOTALL)
    # First link after the closing </span> in the same paragraph.
    authors = re.findall(r'<p class="source">.*?</span>.*?<a.*?>(.*?)</a>', html, re.DOTALL)
    content_blocks = re.findall(r'<div class="contson".*?>(.*?)</div>', html, re.DOTALL)
    # Drop inline tags (e.g. <br />) left inside the poem body, then trim
    # the surrounding whitespace.
    contents = [re.sub(r'<.*?>', '', block).strip() for block in content_blocks]

    # zip pairs items by index and stops at the shortest list, so a page on
    # which the four passes find different numbers of hits yields fewer
    # records instead of raising IndexError (as an index-based loop would).
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def prase_page(url, timeout=10):
    """Download one listing page, print every poem found, and return them.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one the request may wait on an unresponsive server indefinitely.

    Returns:
        The list of poem dicts produced by ``parse_poems``. Earlier versions
        returned None, so callers that ignore the result are unaffected.

    Exceptions raised by ``requests.get`` (connection errors, timeouts) are
    not caught here and propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    poems = parse_poems(resp.text)
    for poem in poems:
        print(poem)
        print('#' * 80)
    return poems

def main():
    """Run ``prase_page`` on listing pages 1 through 10, in order."""
    url_template = "https://www.gushiwen.org/default_%s.aspx"
    # Lazy generator: each URL is built right before its page is processed.
    page_urls = (url_template % page_no for page_no in range(1, 11))
    for page_url in page_urls:
        prase_page(page_url)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import re
import requests


def parse_poems(html):
    """Extract poem records from the HTML text of one listing page.

    Four independent regex passes collect titles, dynasties, authors and
    poem bodies; the results are then paired up by position.

    Args:
        html: Page markup as a string.

    Returns:
        A list of dicts with the keys ``'title'``, ``'dynasty'``,
        ``'author'`` and ``'content'``; empty when nothing matches.
    """
    # re.DOTALL lets ``.`` cross newlines, since each record spans several lines.
    titles = re.findall(r'<div class="sons">.*?<b>(.*?)</b>', html, re.DOTALL)
    # First link inside the "source" paragraph (the one preceding <span>).
    dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>.*?<span>', html, re.DOTALL)
    # First link after the closing </span> in the same paragraph.
    authors = re.findall(r'<p class="source">.*?</span>.*?<a.*?>(.*?)</a>', html, re.DOTALL)
    content_blocks = re.findall(r'<div class="contson".*?>(.*?)</div>', html, re.DOTALL)
    # Drop inline tags (e.g. <br />) left inside the poem body, then trim
    # the surrounding whitespace.
    contents = [re.sub(r'<.*?>', '', block).strip() for block in content_blocks]

    # zip pairs items by index and stops at the shortest list, so a page on
    # which the four passes find different numbers of hits yields fewer
    # records instead of raising IndexError (as an index-based loop would).
    return [
        {
            'title': title,
            'dynasty': dynasty,
            'author': author,
            'content': content,
        }
        for title, dynasty, author, content in zip(titles, dynasties, authors, contents)
    ]


def prase_page(url, timeout=10):
    """Download one listing page, print every poem found, and return them.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one the request may wait on an unresponsive server indefinitely.

    Returns:
        The list of poem dicts produced by ``parse_poems``. Earlier versions
        returned None, so callers that ignore the result are unaffected.

    Exceptions raised by ``requests.get`` (connection errors, timeouts) are
    not caught here and propagate to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    poems = parse_poems(resp.text)
    for poem in poems:
        print(poem)
        print('#' * 80)
    return poems


def main():
    """Run ``prase_page`` on listing pages 1 through 10, in order."""
    url_template = "https://www.gushiwen.org/default_%s.aspx"
    # Lazy generator: each URL is built right before its page is processed.
    page_urls = (url_template % page_no for page_no in range(1, 11))
    for page_url in page_urls:
        prase_page(page_url)


# Start the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()