我是靠谱客的博主 优美雪糕,最近开发中收集的这篇文章主要介绍统计ieee, springer, arxiv, sciencedirect, acm论文中作者信息,觉得挺不错的,现在分享给大家,希望可以做个参考。
概述
先给出代码地址:https://github.com/lyq998/Authors
主要做法是从论文网页中爬取作者姓名,最后对作者进行数量统计。
arxiv
import requests
import re
# Request headers passed to requests.get(); only a browser-style
# user-agent string is set.
my_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def arxiv_authors(url, max_retries=3):
    """Return the author names found on an arXiv paper page.

    The page at *url* is downloaded and every line containing
    ``citation_author`` is reduced to the value of its ``content="..."``
    attribute; commas are then removed from the name.

    Args:
        url: Address of the paper page to download.
        max_retries: Maximum number of download attempts made when the
            request itself fails (timeout, connection error, ...).

    Returns:
        A list of author-name strings (empty when no line matches), or
        ``None`` when the server answers with a status other than 200 or
        every download attempt fails.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=my_headers, timeout=4)
        except requests.RequestException:
            # Network-level failure: retry, but only a bounded number of
            # times instead of recursing without limit.
            continue
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'

        author_names = []
        # Split on real line breaks so each meta tag is inspected on its own.
        for line in response.text.splitlines():
            if 'citation_author' not in line:
                continue
            # Drop everything up to the attribute value, then the tag tail.
            name = re.sub('^.*content="', '', line)
            name = re.sub('"/.*', '', name)
            author_names.append(name.replace(',', ''))
        return author_names
    return None
ieee
import requests
import re
import json
# Request headers passed to requests.get(); only a browser-style
# user-agent string is set.
my_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def _parse_ieee_authors(html_text):
    """Extract author names from the metadata JSON embedded in *html_text*.

    Raises LookupError, TypeError or ValueError when the expected line or
    JSON structure is not present.
    """
    author_lines = [line for line in html_text.splitlines() if '"authors"' in line]
    author_line = author_lines[0]
    # Parse the whole metadata object instead of cutting the "authors"
    # field out with regexes; the latter is not suitable for IEEE Access.
    author_line = re.sub(r'^.*document\.metadata=', '', author_line)
    # Remove the statement terminator that follows the JSON object.
    author_line = author_line.rstrip().rstrip(';')
    metadata = json.loads(author_line)
    return [author['name'] for author in metadata['authors']]


def ieee_authors(url, max_retries=3):
    """Return the author names found on an IEEE paper page.

    The page at *url* is downloaded, the first line containing
    ``"authors"`` is taken, the text after ``document.metadata=`` is parsed
    as JSON, and the ``name`` of each entry in its ``authors`` list is
    collected.

    Args:
        url: Address of the paper page to download.
        max_retries: Maximum number of attempts made when the download
            fails or the downloaded page cannot be parsed.

    Returns:
        A list of author-name strings, or ``None`` when the server answers
        with a status other than 200 or every attempt fails.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=my_headers, timeout=4)
        except requests.RequestException:
            # Network-level failure: retry a bounded number of times.
            continue
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        try:
            return _parse_ieee_authors(response.text)
        except (LookupError, TypeError, ValueError):
            # Page did not have the expected layout; count it as a failed
            # attempt rather than recursing without limit.
            continue
    return None
springer
import requests
import re
# Request headers passed to requests.get(); only a browser-style
# user-agent string is set.
my_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def springer_authors(url, max_retries=3):
    """Return the author names found on a Springer paper page.

    The page at *url* is downloaded and every line containing
    ``"citation_author"`` is reduced to the value of its ``content="..."``
    attribute.

    Args:
        url: Address of the paper page to download.
        max_retries: Maximum number of download attempts made when the
            request itself fails (timeout, connection error, ...).

    Returns:
        A list of author-name strings (empty when no line matches), or
        ``None`` when the server answers with a status other than 200 or
        every download attempt fails.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=my_headers, timeout=4)
        except requests.RequestException:
            # Network-level failure: retry, but only a bounded number of
            # times instead of recursing without limit.
            continue
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'

        author_names = []
        # Split on real line breaks so each meta tag is inspected on its own.
        for line in response.text.splitlines():
            if '"citation_author"' not in line:
                continue
            # Drop everything up to the attribute value, then the tag tail.
            name = re.sub('^.*content="', '', line)
            author_names.append(re.sub('"/.*', '', name))
        return author_names
    return None
sciencedirect
import requests
import re
import json
# Request headers passed to requests.get(); only a browser-style
# user-agent string is set.
my_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def _parse_sciencedirect_authors(html_text):
    """Extract "given-name surname" strings from the JSON embedded in *html_text*.

    Raises LookupError, TypeError or ValueError when the expected line or
    JSON structure is not present.
    """
    json_lines = [line for line in html_text.splitlines() if 'application/json' in line]
    json_line = json_lines[0].strip()
    # Strip the opening tag only up to its own closing '>' so that a '">'
    # occurring inside the JSON payload is never consumed.
    json_line = re.sub('^<script type="application/json"[^>]*>', '', json_line)
    json_line = re.sub('</script>$', '', json_line)
    json_dic = json.loads(json_line)

    author_names = []
    for author_info in json_dic['authors']['content'][0]['$$']:
        if author_info['#name'] != 'author':
            continue
        # Reset per author so a missing part is never inherited from the
        # previously processed author.
        first_name = ''
        last_name = ''
        for name_part in author_info['$$']:
            if name_part['#name'] == 'given-name':
                first_name = name_part['_']
            elif name_part['#name'] == 'surname':
                last_name = name_part['_']
        author_name = ' '.join(part for part in (first_name, last_name) if part)
        if author_name:
            author_names.append(author_name)
    return author_names


def sciencedirect_authors(url, max_retries=3):
    """Return the author names found on a ScienceDirect paper page.

    The page at *url* is downloaded, the first line containing
    ``application/json`` is stripped of its ``<script>`` wrapper and parsed
    as JSON, and for every ``author`` entry under
    ``['authors']['content'][0]['$$']`` the ``given-name`` and ``surname``
    values are joined with a space.

    Args:
        url: Address of the paper page to download.
        max_retries: Maximum number of attempts made when the download
            fails or the downloaded page cannot be parsed.

    Returns:
        A list of author-name strings, or ``None`` when the server answers
        with a status other than 200 or every attempt fails.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=my_headers, timeout=4)
        except requests.RequestException:
            # Network-level failure: retry a bounded number of times.
            continue
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        try:
            return _parse_sciencedirect_authors(response.text)
        except (LookupError, TypeError, ValueError):
            # Page did not have the expected layout; count it as a failed
            # attempt rather than recursing without limit.
            continue
    return None
acm
import requests
import re
# Request headers passed to requests.get(); only a browser-style
# user-agent string is set.
my_headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    ),
}
def _parse_acm_authors(html_text):
    """Extract author names from the author-list line of *html_text*.

    Raises IndexError when no line contains the author-list marker.
    """
    author_lines = [
        line for line in html_text.splitlines()
        if 'rlist--inline loa truncate-list' in line
    ]
    author_line = re.sub('^.*</b></li>', '', author_lines[0]).strip()
    author_items = author_line.split('<li class="loa__item">')
    author_names = []
    # Start from 1: the piece before the first list item is not an author.
    for item in author_items[1:]:
        name = re.sub('^.*"author-name" title="', '', item)
        name = re.sub('"><span class="loa__author-info".*$', '', name)
        author_names.append(name)
    return author_names


def acm_authors(url, max_retries=3):
    """Return the author names found on an ACM paper page.

    The page at *url* is downloaded, the first line containing
    ``rlist--inline loa truncate-list`` is split into
    ``<li class="loa__item">`` pieces, and the ``title`` value following
    ``"author-name"`` is taken from each piece.

    Args:
        url: Address of the paper page to download.
        max_retries: Maximum number of attempts made when the download
            fails or the downloaded page cannot be parsed.

    Returns:
        A list of author-name strings, or ``None`` when the server answers
        with a status other than 200 or every attempt fails.
    """
    for _ in range(max_retries):
        try:
            response = requests.get(url, headers=my_headers, timeout=4)
        except requests.RequestException:
            # Network-level failure: retry a bounded number of times.
            continue
        if response.status_code != 200:
            return None
        response.encoding = 'utf-8'
        try:
            return _parse_acm_authors(response.text)
        except IndexError:
            # No author-list line on the page; count it as a failed
            # attempt rather than recursing without limit.
            continue
    return None
完整的使用示例请参见我 GitHub 仓库里的完整代码。
第一次写这种代码,写得有点烂,希望多跟大家交流。
最后
以上就是优美雪糕为你收集整理的统计ieee, springer, arxiv, sciencedirect, acm论文中作者信息的全部内容,希望文章能够帮你解决统计ieee, springer, arxiv, sciencedirect, acm论文中作者信息所遇到的程序开发问题。
如果觉得靠谱客网站的内容还不错,欢迎将靠谱客网站推荐给程序员好友。
本图文内容来源于网友提供,作为学习参考使用,或来自网络收集整理,版权属于原作者所有。
发表评论 取消回复