python爬虫——验证码识别

128 阅读 0 评论 85 点赞

我是靠谱客的博主魁梧秋天，这篇文章主要介绍python爬虫——验证码识别，现在分享给大家，希望可以做个参考。

python爬虫——验证码识别

识别工具：斐斐打码
使用流程：

单击后，点击用户中心。
进行充值
查看PD账号和PD密钥（后续会用到）
进入开发者板块
查看AppID和AppKey（后续会用到）
进入开发文档
在此可以查看不同验证码的类型
下载python3
解压得到
使用斐斐打码平台
将其打包为一个函数：

复制代码

#imgPath为文件路径，codeType为需要识别的验证码类型
def getCode(imgPath, codeType):
    pd_id = "125987"  # 用户中心页可以查询到pd信息
    pd_key = "cgIQVFmyJT8wZE45fZu35GmYgzNNT3x0"
    app_id = "325987"  # 开发者分成用的账号，在开发者中心可以查询到
    app_key = "8nqa1Gs32QksC9+uLkZxploBxcUpHYBY"
    # 识别类型，
    # 具体类型可以查看官方网站的价格页选择具体的类型，不清楚类型的，可以咨询客服
    pred_type = codeType
    api = FateadmApi(app_id, app_key, pd_id, pd_key)
    # 查询余额
    #balance = api.QueryBalcExtend()  # 直接返余额
    # api.QueryBalc()

# 通过文件形式识别：
    file_name = imgPath
    # 多网站类型时，需要增加src_url参数，具体请参考api文档: http://docs.fateadm.com/web/#/1?page_id=6
    # result =  api.PredictFromFileExtend(pred_type,file_name)   # 直接返回识别结果
    rsp = api.PredictFromFile(pred_type, file_name)  # 返回详细识别结果

'''
    # 如果不是通过文件识别，则调用Predict接口：
    # result 			= api.PredictExtend(pred_type,data)   	# 直接返回识别结果
    rsp             = api.Predict(pred_type,data)				# 返回详细的识别结果
    '''

just_flag = False
    if just_flag:
        if rsp.ret_code == 0:
            # 识别的结果如果与预期不符，可以调用这个接口将预期不符的订单退款
            # 退款仅在正常识别出结果后，无法通过网站验证的情况，请勿非法或者滥用，否则可能进行封号处理
            api.Justice(rsp.request_id)

# card_id         = "123"
    # card_key        = "123"
    # 充值
    # api.Charge(card_id, card_key)
    #LOG("print in testfunc")
    return rsp.pred_rsp.value`

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
#imgPath为文件路径，codeType为需要识别的验证码类型
def getCode(imgPath, codeType):
    pd_id = "125987"  # 用户中心页可以查询到pd信息
    pd_key = "cgIQVFmyJT8wZE45fZu35GmYgzNNT3x0"
    app_id = "325987"  # 开发者分成用的账号，在开发者中心可以查询到
    app_key = "8nqa1Gs32QksC9+uLkZxploBxcUpHYBY"
    # 识别类型，
    # 具体类型可以查看官方网站的价格页选择具体的类型，不清楚类型的，可以咨询客服
    pred_type = codeType
    api = FateadmApi(app_id, app_key, pd_id, pd_key)
    # 查询余额
    #balance = api.QueryBalcExtend()  # 直接返余额
    # api.QueryBalc()

    # 通过文件形式识别：
    file_name = imgPath
    # 多网站类型时，需要增加src_url参数，具体请参考api文档: http://docs.fateadm.com/web/#/1?page_id=6
    # result =  api.PredictFromFileExtend(pred_type,file_name)   # 直接返回识别结果
    rsp = api.PredictFromFile(pred_type, file_name)  # 返回详细识别结果

    '''
    # 如果不是通过文件识别，则调用Predict接口：
    # result 			= api.PredictExtend(pred_type,data)   	# 直接返回识别结果
    rsp             = api.Predict(pred_type,data)				# 返回详细的识别结果
    '''

    just_flag = False
    if just_flag:
        if rsp.ret_code == 0:
            # 识别的结果如果与预期不符，可以调用这个接口将预期不符的订单退款
            # 退款仅在正常识别出结果后，无法通过网站验证的情况，请勿非法或者滥用，否则可能进行封号处理
            api.Justice(rsp.request_id)

    # card_id         = "123"
    # card_key        = "123"
    # 充值
    # api.Charge(card_id, card_key)
    #LOG("print in testfunc")
    return rsp.pred_rsp.value`

识别流程：
1.将验证码图片保存到本地
2.调用getCode函数进行识别

注：本人将fateadm_api.py文件重命名为Code.py

复制代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
import requests
import Code
from lxml import etree

if __name__ == '__main__':
    url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'
     }

    page_text = requests.get(url=url, headers=headers).text

    tree = etree.HTML(page_text)
    #解析验证码图片的路径
    code_img_src = 'https://so.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]

    #得到验证码图片的二进制
    code_img_data = requests.get(url=code_img_src, headers=headers).content
    #存储路径
    imgPath = './code.jpg'

    #将其存储
    with open(imgPath, 'wb') as fp:
        fp.write(code_img_data)

    #得到验证码
    code_text = Code.getCode(imgPath=imgPath, codeType='30400')
    print(code_text)