日韩黑丝制服一区视频播放|日韩欧美人妻丝袜视频在线观看|九九影院一级蜜桃|亚洲中文在线导航|青草草视频在线观看|婷婷五月色伊人网站|日本一区二区在线|国产AV一二三四区毛片|正在播放久草视频|亚洲色图精品一区

分享

Python簡單實現基于VSM的余弦相似度計算

 六味感悟 2016-08-28
# -*- coding: utf-8 -*-
import time         
import re         
import os
import string
import sys
import math
''' ------------------------------------------------------- '''
#統計關鍵詞及個數
def CountKey(fileName, resultName):
    """Count word occurrences in a text file and save the ranking.

    Every line of ``fileName`` is split on single spaces; empty tokens
    (produced by consecutive spaces) are ignored. The counts are written to
    ``resultName`` as one ``<word:count>`` entry per line, most frequent
    word first. Progress (line count, each line, each token list) is echoed
    to standard output.

    Args:
        fileName: Path of the text file to analyse.
        resultName: Path of the report file to create or overwrite.

    Returns:
        A list of ``(word, count)`` tuples sorted by descending count, or
        ``None`` when an error occurred (the error is printed, not raised).
    """
    try:
        # Read everything inside ``with`` so the handle is always closed,
        # even when a later step fails.
        with open(fileName, 'r') as source:
            lines = source.readlines()
        print(u'文件行數: ' + str(len(lines)))

        # Mapping of word -> number of occurrences.
        table = {}
        for line in lines:
            line = line.rstrip('\n')
            print(line)

            words = line.split(' ')
            shown = str(words)
            if hasattr(shown, 'decode'):
                # Python 2 only: undo the byte escapes so that non-ASCII
                # words in the list are printed readably.
                shown = shown.decode('string_escape')
            print(shown)

            for word in words:
                if word != '':
                    table[word] = table.get(word, 0) + 1

        # Most frequent words first.
        dic = sorted(table.items(), key=lambda item: item[1], reverse=True)
        with open(resultName, 'w') as result:
            for word, count in dic:
                result.write('<' + word + ':' + str(count) + '>\n')
        return dic

    except Exception as e:
        # Best-effort boundary kept from the original: report the problem
        # and fall through, so the caller receives None.
        print('Error: %s' % e)
    finally:
        print('END\n')
''' ------------------------------------------------------- '''
#統計關鍵詞及個數 并計算相似度
def MergeKeys(dic1, dic2):
    """Return the cosine similarity of two keyword-frequency lists.

    The keywords of both inputs are merged into one ordered vocabulary
    (all keys of ``dic1`` first, then the keys of ``dic2`` not seen yet).
    Each input is turned into a count vector over that vocabulary and the
    cosine of the angle between the two vectors is returned. Intermediate
    values are echoed to standard output.

    Args:
        dic1: Sequence of ``(keyword, count)`` pairs for the first document.
        dic2: Sequence of ``(keyword, count)`` pairs for the second document.

    Returns:
        The cosine similarity as a float. ``0.0`` is returned when either
        vector has zero length (for example an empty keyword list), where
        the quotient would otherwise divide by zero.
    """
    # Merged vocabulary plus a keyword -> first index map, so later lookups
    # do not need a linear scan of the list.
    arrayKey = [pair[0] for pair in dic1]
    position = {}
    for index, key in enumerate(arrayKey):
        position.setdefault(key, index)
    for pair in dic2:
        key = pair[0]
        if key in position:
            print('has_key %s' % (key,))
        else:
            position[key] = len(arrayKey)
            arrayKey.append(key)
    print('\n')

    test = str(arrayKey)
    if hasattr(test, 'decode'):
        # Python 2 only: undo the byte escapes so that non-ASCII keywords
        # are printed readably.
        test = test.decode('string_escape')
    print(test)

    # Raw counts are used as term weights.
    arrayNum1 = [0] * len(arrayKey)
    arrayNum2 = [0] * len(arrayKey)
    for pair in dic1:
        arrayNum1[position[pair[0]]] = pair[1]
    for pair in dic2:
        arrayNum2[position[pair[0]]] = pair[1]

    print(arrayNum1)
    print(arrayNum2)
    print('%d %d %d' % (len(arrayNum1), len(arrayNum2), len(arrayKey)))

    # Dot product of the two vectors.
    x = sum(a * b for a, b in zip(arrayNum1, arrayNum2))
    print(x)

    # Squared norms of the two vectors.
    sq1 = sum(a * a for a in arrayNum1)
    print(sq1)
    sq2 = sum(b * b for b in arrayNum2)
    print(sq2)

    norm = math.sqrt(sq1) * math.sqrt(sq2)
    if norm == 0:
        # The cosine is undefined for a zero vector; report no similarity.
        return 0.0
    return float(x) / norm
     
''' -------------------------------------------------------
    基本步驟:
        1.分別統計兩個文檔的關鍵詞
        2.兩篇文章的關鍵詞合并成一個集合,相同的合并,不同的添加
        3.計算每篇文章對于這個集合的詞的詞頻 TF-IDF算法計算權重
        4.生成兩篇文章各自的詞頻向量
        5.計算兩個向量的余弦相似度,值越大表示越相似
    ------------------------------------------------------- '''
#主函數
def main():
    """Compare two keyword files and print their cosine similarity.

    Counts the keywords of ``BaiduSpider.txt`` and of ``001.txt`` inside the
    ``HudongSpider`` directory, writes a count report for each, then prints
    the similarity of the two keyword vectors. Nothing is compared when
    either file could not be processed.
    """
    # Document 1: keywords and their counts.
    fileName1 = 'BaiduSpider.txt'
    resultName1 = 'Result_Key_BD.txt'
    dic1 = CountKey(fileName1, resultName1)

    # Document 2: keywords and their counts. The paths are joined rather
    # than written with a backslash, because "\001" inside a string literal
    # is an octal escape and not a directory separator.
    fileName2 = os.path.join('HudongSpider', '001.txt')
    resultName2 = os.path.join('HudongSpider', 'Result_Key_001.txt')
    dic2 = CountKey(fileName2, resultName2)

    # CountKey returns None after reporting an error; there is nothing to
    # compare in that case.
    if dic1 is None or dic2 is None:
        return

    # Merge both keyword lists and compute the similarity.
    result = MergeKeys(dic1, dic2)
    print(result)
# Run the comparison only when this file is executed as a script.
if __name__ == '__main__':
    main()

    本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
    轉藏 分享 獻花(0)

    0條評論

    發表

    請遵守用戶 評論公約

    類似文章 更多