挖掘DBLP作者合作关系,FP-Growth算法实践(4):挖掘每个会议的
发布时间:2021-05-25 23:32:00 所属栏目:大数据 来源:网络整理
导读:在只有【论文标题、发布时间、作者、会议名称】这四种信息的情况下,首先提取出所有这四种信息: 代码产生的结果如下,数据结构类似于headerTable,看结果就知道了,不再介绍: authorDict={} #{authorName: total(frequence,startYear,endYear),{eachConf
|
在只有【论文标题、发布时间、作者、会议名称】这四种信息的情况下,首先提取出所有这四种信息。代码产生的结果如下,数据结构类似于headerTable,看结果就知道了,不再介绍:
authorDict={} #{authorName: [total frequence,startYear,endYear,{eachConf: [frequence,startYear,endYear]}]}
authorDict,conferenceDict=CountAuthorAndConferenceFrequence(tranDB)
# Number of distinct author names collected as keys of authorDict.
print len(authorDict) #31886
# One author's entry; layout shown by the sample output below:
# [total papers, first year, last year, {conference: [papers, first year, last year]}]
print authorDict["Ying Wu"]
#[51,2000,2015,{'KDD': [1,2007,2007],'SDM': [1,2005,2005],'CVPR': [49,2000,2015]}]
# Paper count per conference.
print conferenceDict
#{'CVPR': 7722,'PAKDD': 1760,'ICML': 1111,'KDD': 2360,'COLT': 736,'ICDM': 3873,'DMKD': 47,'SIGIR': 2772,'ECML/PKDD': 1036,'SDM': 1156,'WSDM': 618,'NIPS': 3852}
# The code is as follows:
def _is_missing(value):
    """Return True when *value* is None or a NaN float (an empty cell)."""
    # NaN is the only value that compares unequal to itself. Unlike an
    # identity test against ``np.nan`` this also catches NaN objects that are
    # not that particular singleton.
    return value is None or (isinstance(value, float) and value != value)


def _record_paper(stats, year):
    """Bump the count in ``stats`` and widen its [startYear, endYear] span."""
    stats[0] += 1
    if year < stats[1]:
        stats[1] = year
    elif year > stats[2]:
        stats[2] = year


def CountAuthorAndConferenceFrequence(tranDB):
    """Tally paper counts and active years per author and per conference.

    Args:
        tranDB: iterable of ``(conf, year, authorList)`` records, where
            ``authorList`` is a ``"|"``-separated string of author names.
            A record with any missing (None/NaN) field is skipped entirely.

    Returns:
        A tuple ``(authorDict, conferenceDict)``:

        * ``authorDict[name]`` is
          ``[count, startYear, endYear, {conf: [count, startYear, endYear]}]``
        * ``conferenceDict[conf]`` is the number of records for ``conf``.
    """
    authorDict = {}
    conferenceDict = {}
    for i, (conf, year, authorList) in enumerate(tranDB):
        # Progress trace, one line per record (same text the original printed).
        print("trans %s %s" % (i, "==" * 20))
        if _is_missing(conf) or _is_missing(year) or _is_missing(authorList):
            continue  # incomplete row, e.g. tranDB[2426,:]

        conferenceDict[conf] = conferenceDict.get(conf, 0) + 1

        for author in authorList.split("|"):
            entry = authorDict.get(author)
            if entry is None:
                # First paper seen for this author: start and end year are
                # both ``year``, overall and for this conference. All four
                # slots must exist because later updates index [1], [2], [3].
                authorDict[author] = [1, year, year, {conf: [1, year, year]}]
                continue

            _record_paper(entry, year)

            perConf = entry[3]
            if conf in perConf:
                _record_paper(perConf[conf], year)
            else:
                perConf[conf] = [1, year, year]
    return authorDict, conferenceDict
提取核心研究者,就是阈值控制,没技术含量: '''
authorDict={} #{authorName: [total frequence,startYear,endYear,{conf: [frequence,startYear,endYear]}]}
conferenceDict={} #{conference: count}
'''
def FindCoreResearcher(authorDict, conferenceDict, coreSupport=0.05):
    """Report core researchers per conference and overall.

    An author is a core researcher of a conference when their paper count
    there reaches that conference's threshold, and a core researcher overall
    when their total paper count reaches the overall threshold. Findings are
    printed and also written to ``CoreResearcher.txt`` in the current working
    directory (the file is overwritten).

    Args:
        authorDict: ``{name: [count, startYear, endYear,
            {conf: [count, startYear, endYear]}]}``.
        conferenceDict: ``{conf: paper count}``.
        coreSupport: fraction of a paper count used as the base threshold.

    Returns:
        ``{paper count: number of authors with exactly that many papers}``.
    """
    allConferenceTotalCount = 0
    paperCount_authorCount_Dict = {}

    # ``with`` guarantees the report is flushed and closed even if a
    # malformed authorDict entry raises part-way through.
    with open("CoreResearcher.txt", "w") as wf:
        for conf, count in conferenceDict.items():
            allConferenceTotalCount += count
            coreSup = count * coreSupport
            if coreSup < 2:
                # at least 2, or how can we call someone a core researcher?
                coreSup = 2
            elif coreSup > 20:
                # large conferences would otherwise yield too few researchers
                coreSup = coreSup / 2
            elif coreSup > 10:
                # coreSup/2 would admit too many here, so cap at 10 instead
                coreSup = 10

            print("conf: %s ,total paper count: %s %s coreSup= %s"
                  % (conf, count, "##" * 20, coreSup))
            wf.write("conf:%s,total paper count:%s%scoreSup=%s\n"
                     % (conf, count, "##" * 20, coreSup))

            for author, stats in authorDict.items():
                confStats = stats[3].get(conf)
                if confStats is None or confStats[0] < coreSup:
                    continue
                print("%s is a core researcher with support %s"
                      % (author, confStats[0]))
                wf.write("%s is a core researcher with support %s\n"
                         % (author, confStats[0]))
                wf.write("\t His/Her active time is==>[%s-%s]\n"
                         % (confStats[1], confStats[2]))

        coreSup = allConferenceTotalCount * coreSupport
        if coreSup > len(conferenceDict) * 15:
            # at least a paper a year in (at least 1/4 of all the conferences);
            # ``//`` keeps the integer result Python 2's ``/`` produced here.
            coreSup = len(conferenceDict) * 15 // 4

        print("all conference,total paper count: %s %s the overall coreSupport= %s"
              % (allConferenceTotalCount, "##" * 20, coreSup))
        wf.write("all conference,total paper count:%s%sthe overall coreSupport=%s\n"
                 % (allConferenceTotalCount, "##" * 20, coreSup))

        for author, stats in authorDict.items():
            total = stats[0]
            if total >= coreSup:
                print("%s is a core researcher with support %s" % (author, total))
                wf.write("%s is a core researcher with support %s\n"
                         % (author, total))
                wf.write("\t His/Her active time is==>[%s-%s]\n"
                         % (stats[1], stats[2]))
            # Count how many authors have published ``total`` papers; this
            # histogram is meant to guide the frequent-pattern minSupport.
            # NOTE(review): the source's indentation was lost. Counting every
            # author (not only core researchers) is assumed -- confirm.
            paperCount_authorCount_Dict[total] = (
                paperCount_authorCount_Dict.get(total, 0) + 1
            )
    return paperCount_authorCount_Dict
(编辑:黄山站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |
站长推荐


