# -*- coding: utf-8 -*-
import time
import re
import os
import string
import sys
import math
''' ------------------------------------------------------- '''
# Count the keywords and their number of occurrences
def CountKey(fileName, resultName):
    """Count the whitespace-separated words of a file and save the ranking.

    Every line of ``fileName`` is split on whitespace and the occurrences of
    each word are tallied.  The tallies are sorted by count, highest first,
    and written to ``resultName`` as one ``<word:count>`` entry per line.
    Each input line and its word list are echoed to stdout while counting.

    Args:
        fileName: Path of the text file to analyse.
        resultName: Path of the file that receives the ranking.

    Returns:
        A list of ``(word, count)`` tuples ordered by descending count, or
        ``None`` when an error occurred (the error is printed, not raised).
    """
    try:
        # Read the file once inside a context manager: the handle is released
        # even on failure and no second pass is needed to count the lines.
        # Plain 'r' is used because the 'U' flag is rejected by Python 3.11+.
        with open(fileName, 'r') as source:
            lines = source.readlines()
        # NOTE(review): this label looks garbled by an encoding conversion;
        # it is kept unchanged because it is runtime output - confirm the
        # intended text.
        print(u'文件行數(shù): ' + str(len(lines)))

        # Tally format: <key:value> = <word:number of occurrences>
        table = {}
        for line in lines:
            line = line.rstrip('\n')
            print(line)
            # split() without arguments never yields empty strings, so no
            # separate empty-word check is required.
            words = line.split()
            # Quote the words by hand so non-ASCII text is shown readably
            # instead of as escaped bytes.
            print("[%s]" % ", ".join("'%s'" % (word,) for word in words))
            for word in words:
                table[word] = table.get(word, 0) + 1

        # Highest count first; sorted() is stable, so ties keep dict order.
        dic = sorted(table.items(), key=lambda item: item[1], reverse=True)

        with open(resultName, 'w') as result:
            for word, count in dic:
                result.write("<" + word + ":" + str(count) + ">\n")
        return dic
    except Exception as e:
        # Deliberate best effort: report the problem and signal it with None.
        print('Error: %s' % (e,))
        return None
    finally:
        print('END\n')
''' ------------------------------------------------------- '''
# Merge the keywords of both documents and compute their similarity
def MergeKeys(dic1, dic2):
    """Return the cosine similarity of two keyword rankings.

    The keywords of both rankings are merged into one ordered vocabulary
    (all keywords of ``dic1`` first, then the keywords only found in
    ``dic2``).  Each ranking is turned into a count vector over that
    vocabulary and the cosine of the angle between the vectors is returned.
    Intermediate values (shared keywords, vocabulary, vectors, dot product
    and squared norms) are printed to stdout.

    Args:
        dic1: Sequence of ``(keyword, count)`` pairs of the first document.
            ``None`` is treated as an empty sequence.
        dic2: Sequence of ``(keyword, count)`` pairs of the second document.
            ``None`` is treated as an empty sequence.

    Returns:
        float: ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector
        is all zeros, where the cosine is undefined.
    """
    # A failed keyword count may hand over None; treat it like "no keywords".
    pairs1 = list(dic1) if dic1 else []
    pairs2 = list(dic2) if dic2 else []

    # Merged vocabulary plus the first position of every keyword, so that
    # membership tests and index lookups are O(1) instead of list scans.
    arrayKey = [pair[0] for pair in pairs1]
    position = {}
    for index, key in enumerate(arrayKey):
        position.setdefault(key, index)
    for pair in pairs2:
        key = pair[0]
        if key in position:
            print('has_key %s' % (key,))
        else:
            position[key] = len(arrayKey)
            arrayKey.append(key)
    print('\n')
    # Quote the keywords by hand so non-ASCII text is shown readably.
    print("[%s]" % ", ".join("'%s'" % (key,) for key in arrayKey))

    # Term-frequency vectors over the merged vocabulary (plain counts, no
    # TF-IDF weighting).
    arrayNum1 = [0] * len(arrayKey)
    arrayNum2 = [0] * len(arrayKey)
    for pair in pairs1:
        arrayNum1[position[pair[0]]] = pair[1]
    for pair in pairs2:
        arrayNum2[position[pair[0]]] = pair[1]
    print(arrayNum1)
    print(arrayNum2)
    print('%d %d %d' % (len(arrayNum1), len(arrayNum2), len(arrayKey)))

    # Dot product of the two vectors
    x = sum(a * b for a, b in zip(arrayNum1, arrayNum2))
    print(x)

    # Squared norms of the two vectors
    sq1 = sum(a * a for a in arrayNum1)
    print(sq1)
    sq2 = sum(b * b for b in arrayNum2)
    print(sq2)

    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero.
    if sq1 == 0 or sq2 == 0:
        return 0.0
    return float(x) / (math.sqrt(sq1) * math.sqrt(sq2))
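''' -------------------------------------------------------
Basic steps:
1. Count the keywords of each of the two documents.
2. Merge the keywords of both documents into one set: shared keywords are
   merged, new keywords are added.
3. Compute the term frequency of each document over that set (the TF-IDF
   algorithm can be used to compute weights).
4. Build the term-frequency vector of each document.
5. Compute the cosine similarity of the two vectors; the larger the value,
   the more similar the documents.
------------------------------------------------------- '''
# Main function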
def main():
    """Compare the Baidu and Hudong keyword files and print their similarity.

    Counts the keywords of both input files with ``CountKey`` (which also
    writes one ranking file per input) and prints the value ``MergeKeys``
    computes from the two rankings.  Nothing is compared when either input
    could not be processed.
    """
    # Document 1 (Baidu): keywords and their counts
    fileName1 = "BaiduSpider.txt"
    resultName1 = "Result_Key_BD.txt"
    dic1 = CountKey(fileName1, resultName1)

    # Document 2 (Hudong): keywords and their counts.
    # os.path.join yields the same "HudongSpider\..." paths on Windows and
    # keeps the script usable on platforms with a different separator.
    fileName2 = os.path.join("HudongSpider", "001.txt")
    resultName2 = os.path.join("HudongSpider", "Result_Key_001.txt")
    dic2 = CountKey(fileName2, resultName2)

    # CountKey has already printed the error; there is nothing to compare.
    if dic1 is None or dic2 is None:
        return

    # Merge the keywords of both documents and compute their similarity
    result = MergeKeys(dic1, dic2)
    print(result)
# Run the comparison only when executed as a script, not when imported.
if __name__ == '__main__':
    main()