简单NLP TF-IDF算法实现关键词文本搜索

程序员文章站 2022-07-10 08:05:10

利用tf-idf算法搜索出与关键词有关的前K个文本：import os / from nltk.text import TextCollection / from nltk.tokenize import word_tokenize / from collections import Counter / def find_key(key_word): file_path='B3' file_list=os.listdir(file_path) sents=[] all_txt=[] t...

利用tf-idf算法搜索出与关键词有关的前K个文本

import os
from nltk.text import TextCollection
from nltk.tokenize import word_tokenize
from collections import Counter
def find_key(key_word):
    """Rank the text files under ``B3`` by TF-IDF for each search keyword.

    For every keyword the function prints the keyword, computes its TF-IDF
    score in each document, asks the user (via ``input``) how many results
    to show, and prints the top-k ``(file_name, score)`` tuples.  After all
    keywords are processed it prints the keyword list and the top-k files
    ranked by the sum of their per-keyword scores, using the last ``k`` the
    user entered.

    Args:
        key_word: Iterable of keyword strings to search for.

    Returns:
        None.  All results are written to standard output.

    Raises:
        ValueError: If the user enters a non-integer value for ``k``.
        OSError: If the ``B3`` directory or one of its files cannot be read.
    """
    file_path = 'B3'

    # Read and tokenize every document exactly once.  The token lists do not
    # depend on the keyword, so doing this inside the keyword loop only
    # repeated the same disk reads and tokenization.
    doc_tokens = {}
    for file_name in os.listdir(file_path):
        full_path = os.path.join(file_path, file_name)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be opened as text documents.
            continue
        with open(full_path) as handle:
            doc_tokens[file_name] = word_tokenize(handle.read())

    # One collection over all documents supplies the idf of each keyword.
    corpus = TextCollection(list(doc_tokens.values()))

    combined_scores = {}
    k = 0  # stays 0 when no keyword is given, so the final slice is empty
    for term in key_word:
        print("\n")
        print(term)

        # idf is undefined for an empty collection; treat it as 0.
        idf = corpus.idf(term) if doc_tokens else 0.0

        tf_idf_scores = {}
        for file_name, tokens in doc_tokens.items():
            # tf = occurrences / document length; an empty document has tf 0
            # instead of triggering a division by zero.
            tf = tokens.count(term) / len(tokens) if tokens else 0.0
            tf_idf_scores[file_name] = tf * idf

        # Counter addition sums values of equal keys and keeps only the
        # entries whose total is positive.
        combined_scores = dict(Counter(tf_idf_scores) + Counter(combined_scores))

        # Sort by score, highest first; the result is a list of tuples.
        ranked = sorted(tf_idf_scores.items(),
                        key=lambda item: item[1], reverse=True)
        k = int(input('输出tf-idf值最大的前k个文本: '))
        for entry in ranked[:k]:
            print(entry)

    combined_ranked = sorted(combined_scores.items(),
                             key=lambda item: item[1], reverse=True)
    print("\n")
    print(key_word)
    print("\n输出关键字合并的tf-idf值最大的前k个文本：")
    for entry in combined_ranked[:k]:
        print(entry)
def find_article(name):
    """Print the full content of the file ``B3/<name>``.

    Args:
        name: File name relative to the ``B3`` directory.

    Raises:
        OSError: If the file cannot be opened (for example, it does not
            exist); the exception from ``open`` is propagated unchanged.
    """
    # The context manager closes the file even if reading or printing fails.
    with open(f'B3/{name}', 'r') as file:
        print(file.read())
def show_menu():
    """Print the operations menu (exit, keyword search, article lookup)."""
    # The leading empty entry and the trailing four-space entry reproduce the
    # blank first line and the indented last line of the menu text.
    menu_lines = (
        "",
        "操作菜单",
        "0：退出",
        "1：查询关键字",
        "2：查询文章",
        "    ",
    )
    print("\n".join(menu_lines))
# Interactive menu loop: show the menu, read an operation number and dispatch
# to the matching action until the user chooses 0.
while True:
    show_menu()
    try:
        num = int(input("请输入操作编号："))
    except ValueError:
        # Non-numeric input is reported like any other invalid choice
        # instead of terminating the program with a traceback.
        print("输入有误")
        continue
    if num == 0:
        print("已退出系统")
        break
    elif num == 1:
        print("查询关键字")
        str_key_word = input('输入搜索关键词：')
        # Keywords are separated by single spaces.
        key_word = str_key_word.split(" ")
        find_key(key_word)
    elif num == 2:
        print("查询文章")
        name = input("请输入想要查找的文件名：")
        try:
            find_article(name)
        except FileNotFoundError:
            # A mistyped file name should not end the whole session.
            print("输入有误")
    else:
        print("输入有误")

-----------------------------------------------------------------------------------------------------------------------------------------------——————————————————————————————————————————————————————————运行结果图

简单NLP TF-IDF算法实现关键词文本搜索

写得不好，很简陋，将就看看吧。程序没有剔除停用词（比如 is、the、be 等）；另外，前面利用 NLTK 包创建语料库的部分应该先建立索引（比如倒排索引），这样可以大大减少循环的时间。不建索引的话，每一次搜索关键词都需要遍历所有文章，很愚蠢，hhhh。

本文地址：https://blog.csdn.net/Dylanqin/article/details/109643915

简单NLP TF-IDF算法实现关键词文本搜索

利用tf-idf算法搜索出与关键词有关的前K文本

python TF-IDF算法实现文本关键词提取

简单NLP TF-IDF算法实现关键词文本搜索

简单NLP TF-IDF算法实现关键词文本搜索