Arxiv word frequency analysis

1 minute read

Published:


对 arXiv 上 Astro2020 white paper 的相关论文进行词频统计

检索文章并保存id,title,abstract等信息

from datetime import datetime
import os
import arxiv#query num
now = datetime.now()  # current local time; used to date-stamp the output file names
print(now)

# Search arXiv for "astro2020" in any field, capped at 1000 results.
papersastro2020 = arxiv.query(query="all:astro2020", max_results=1000)

# Three date-stamped outputs: a combined dump (id, title, abstract),
# a title-only file and an abstract-only file.
date_tag = '%d-%d-%d' % (now.year, now.month, now.day)
astro2020filename = 'astro2020whitepaperuptodate%s.txt' % date_tag
title = 'astro2020whitepaperuptodatetitle%s.txt' % date_tag
summary = 'astro2020whitepaperuptodatesummary%s.txt' % date_tag

# Open each file once in 'w' mode. This truncates any file left over from an
# earlier run on the same day (so no explicit delete is needed), avoids
# reopening three files for every paper, and still creates the files when the
# query returns nothing, so a later read of `title` does not fail.
with open(astro2020filename, 'w') as f_all, \
        open(title, 'w') as f_title, \
        open(summary, 'w') as f_summary:
    for paper in papersastro2020:
        # Combined file: one blank-line-separated record per paper.
        f_all.write(paper['id'] + '\n' + paper['title'] + '\n'
                    + paper['summary'] + '\n' + '\n')
        f_title.write(paper['title'] + '\n')
        f_summary.write(paper['summary'] + '\n')

分词

import jieba
from collections import Counter
# Load every saved title as one string (`title` holds the title file's path).
with open(title) as f:
    s = f.read()

# Lower-case before tokenizing so that counting is case-insensitive.
lowered = s.lower()
word_list = list(jieba.cut(lowered))
print('分词总数:', len(word_list))

# Raw token frequencies, before any stop-word filtering.
words_count = Counter(word_list)
most_words = words_count.most_common(n=100)

from nltk.corpus import stopwords
from nltk.corpus import wordnet

# NLTK's English stop words, extended with punctuation/whitespace tokens
# and with words that are part of the search topic itself.
stops = list(stopwords.words('english'))
stops += ["=", "->"," ","\n","-",":",",","a",'x','(',')',"'","\"",'?','+',"$"]
stops += ['\\','.','/',';','~','','_']
stops += ['white','paper','astro2020']

# Build the set once: membership against the list is a linear scan per token.
stop_set = set(stops)
words = [w for w in word_list if w not in stop_set]
top_twenty_title = Counter(words).most_common(n=20)

# Print only the entries that WordNet has a synset for and that are purely
# alphabetic; `top_twenty_title` itself is left unfiltered.
for entry in top_twenty_title:
    if wordnet.synsets(entry[0]) and entry[0].isalpha():
        print(entry)

Plot a bar chart of the title word statistics

def bar_chart(items,title,filename):
    """Draw, save and show a bar chart of the count associated with each key.

    Relies on ``np`` (numpy) and ``plt`` (matplotlib.pyplot) being available
    at module scope.

    Args:
        items: list of (key, count) pairs; one bar is drawn per pair, in the
            given order. An empty list produces an empty chart.
        title: text used as the plot title.
        filename: path the figure is saved to via ``plt.savefig``.
    """
    width = 0.5
    columns = list(zip(*items))
    # zip(*[]) yields nothing, so fall back to empty label/count sequences.
    labels, counts = (columns[0], columns[1]) if columns else ((), ())
    ind = np.arange(len(labels))
    fig, ax = plt.subplots(figsize=(8,8))
    # Centre each bar on its index explicitly so the ticks below line up
    # with the bars regardless of the library's default alignment.
    ax.bar(ind, counts, width, color='r', align='center')
    ax.set_xticks(ind)
    ax.set_xticklabels(labels)
    fig.autofmt_xdate()  # rotates the x tick labels so they do not overlap
    plt.title(title)
    plt.savefig(filename)
    plt.show()

# Chart the most common title words and save the figure as a PNG.
bar_chart(
    top_twenty_title,
    title='top_twenty word in title of Astro2020 White Paper',
    filename='top_20_title.png',
)

欢迎关注微信公众号:曜灵集