arXiv word frequency analysis
Published:
对 arXiv 上与 Astro2020 white paper 相关的论文进行词频统计
检索文章并保存 id、title、abstract(即 summary 字段)等信息
from datetime import datetime
import os
import arxiv#query num
# Current local time; printed for the log and used to date-stamp the output
# file names below.
now = datetime.now()
print(now)

# Query arXiv for records matching "astro2020" in any field, capped at 1000.
papersastro2020 = arxiv.query(query="all:astro2020", max_results=1000)

# Three output files share the same non-zero-padded year-month-day suffix:
# full records, titles only, and summaries only.
date_suffix = '%d-%d-%d.txt' % (now.year, now.month, now.day)
astro2020filename = 'astro2020whitepaperuptodate' + date_suffix
title = 'astro2020whitepaperuptodatetitle' + date_suffix
summary = 'astro2020whitepaperuptodatesummary' + date_suffix

# Open each file exactly once in write mode. Mode 'w' truncates whatever an
# earlier run on the same day left behind, so no separate delete step is
# needed, and the files are no longer reopened for every single paper.
# NOTE(review): the platform default text encoding is used here, matching the
# code that reads the title file back; consider passing encoding='utf-8' to
# both the writer and the reader together.
with open(astro2020filename, 'w') as records_file, \
        open(title, 'w') as titles_file, \
        open(summary, 'w') as summaries_file:
    for paper in papersastro2020:
        # Full record: id, title and summary on separate lines, followed by
        # a blank separator line.
        records_file.write(
            paper['id'] + '\n' + paper['title'] + '\n' + paper['summary'] + '\n\n'
        )
        titles_file.write(paper['title'] + '\n')
        summaries_file.write(paper['summary'] + '\n')
分词
import jieba
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import wordnet

# Read the saved titles back and tokenize the lower-cased text with jieba.
with open(title) as f:
    s = f.read()
word_list = list(jieba.cut(s.lower()))
print('分词总数:', len(word_list))

# Raw token frequencies, before any stop-word filtering.
words_count = Counter(word_list)
most_words = words_count.most_common(100)

# Stop list: NLTK's English stop words, plus punctuation / whitespace tokens
# and the words shared by every paper in this corpus.
stops = list(stopwords.words('english'))
stops += ["=", "->"," ","\n","-",":",",","a",'x','(',')',"'","\"",'?','+',"$"]
stops +=['\\','.','/',';','~','','_']
stops +=['white','paper','astro2020']
# Further punctuation that could be added to the stop list:
# [!"#$&()*+,-./:;<=>?@[\\]^_{|}·~‘’]

# Membership is tested once per token, so look tokens up in a hashed set
# instead of rescanning the stop list every time. `stops` itself stays a list.
stop_set = frozenset(stops)
words = [w for w in word_list if w not in stop_set]
top_twenty_title = Counter(words).most_common(n=20)

# Print only the entries whose token is purely alphabetic and has at least
# one WordNet synset. `top_twenty_title` itself is left unfiltered.
for entry in top_twenty_title:
    token = entry[0]
    if token.isalpha() and wordnet.synsets(token):
        print(entry)
Plot a bar chart of the title word-frequency statistics
def bar_chart(items, title, filename):
    """Draw, save and show a bar chart of the count associated with each key.

    `items` is a list of (key, count) pairs; one bar is drawn per pair, in
    the order given. An empty list produces an empty chart.
    `title` is the chart title.
    `filename` is the path the figure is saved to before it is shown.
    """
    # NOTE(review): `plt` is not imported in the visible part of this file;
    # presumably it is matplotlib.pyplot -- confirm it is in scope before
    # this function is called.
    width = 0.5

    # Split the pairs into parallel sequences. Indexing each pair (rather
    # than transposing with zip) keeps empty input from raising IndexError.
    labels = [pair[0] for pair in items]
    counts = [pair[1] for pair in items]
    ind = list(range(len(items)))

    fig, ax = plt.subplots(figsize=(8, 8))
    # Bars are centred on their x positions, so the ticks go at those same
    # positions; offsetting them by `width` put every label between two bars.
    ax.bar(ind, counts, width, color='r', align='center')
    ax.set_xticks(ind)
    ax.set_xticklabels(labels)
    # Slant the x tick labels so long keys do not overlap.
    fig.autofmt_xdate()
    plt.title(title)
    plt.savefig(filename)
    plt.show()
# Chart the most common title tokens and save the figure as a PNG.
bar_chart(
    top_twenty_title,
    title='top_twenty word in title of Astro2020 White Paper',
    filename='top_20_title.png',
)
欢迎关注微信公众号:曜灵集