更に、前回entryの続きとして以下のスクリプトを記載する。
"""Extract top TF-IDF keyword features from a QA consultation CSV.

Reads QA rows from ``qa_srcs_full.csv`` (cp932-encoded), tokenizes each
row's title + consultation text with Sudachi, keeps only common nouns
(普通名詞), and prints the top-10 TF-IDF terms per document.
"""
from sklearn.feature_extraction.text import TfidfVectorizer
from sudachipy import dictionary
import csv
import pandas
import re
import unicodedata

# Input CSV of QA records (Shift-JIS/cp932 encoded).
qa_sys_src_csv = "qa_srcs_full.csv"
# Sudachi configuration pointing at the user dictionary.
sudachi_conf_json = "c:/Users/end0t/tmp/QA_SGST/sudachi_user_dic/sudachi.json"


def main():
    # Load the texts that will be scored with TF-IDF.
    qas = load_qa_sys_src()

    tokenizer_obj = dictionary.Dictionary(config_path=sudachi_conf_json).create()

    docs = []
    for qa_src in qas:
        # Title + "。" + consultation body form one document.
        org_txt = "".join([qa_src["表題"], "。", qa_src["相談内容"]])
        # Morphological analysis: keep only common nouns, in normalized
        # form, joined with spaces so TfidfVectorizer can split them.
        doc = [
            token.normalized_form()
            for token in tokenizer_obj.tokenize(org_txt)
            if token.part_of_speech()[1] == "普通名詞"
        ]
        docs.append(" ".join(doc))

    # TF-IDF processing.
    vectorizer = TfidfVectorizer(smooth_idf=False)
    tfidf_matrix = vectorizer.fit_transform(docs)
    tf_idf = tfidf_matrix.toarray()

    # Terms corresponding to each matrix column.
    feature_names = vectorizer.get_feature_names_out()

    # Transposed so each column is a document and each row a term.
    tf_idf_df = pandas.DataFrame(tf_idf, columns=feature_names).T

    # Show the top-10 feature terms for each document.
    # FIX: the original hard-coded range(100), which raised an exception
    # whenever the CSV held fewer than 100 rows.
    for doc_no in range(min(100, len(qas))):
        print(qas[doc_no]["表題"], qas[doc_no]["相談内容"])
        print(tf_idf_df.sort_values(doc_no, ascending=False)[:10][doc_no])


def load_qa_sys_src():
    """Load QA rows from the source CSV and normalize the text fields.

    Returns a list of dicts keyed by the CSV header; the four fields
    below are normalized via normalize_word().
    """
    ret_datas = []
    with open(qa_sys_src_csv, encoding="cp932") as f:
        reader = csv.DictReader(f)
        for row in reader:
            # Normalize every field that later feeds tokenization.
            for key in ("相談No", "表題", "相談内容", "回答内容"):
                row[key] = normalize_word(row[key])
            ret_datas.append(row)
    return ret_datas


# Sudachi user dictionaries require character-normalized text.
# https://zenn.dev/sorami/articles/6bdb4bf6c7f207
def normalize_word(word):
    """Strip whitespace, NFKC-normalize, lowercase, and drop ',' / '--'."""
    # FIX: raw string — the original "[\s\n ]+" relied on Python passing
    # the invalid "\s" escape through, which now emits a SyntaxWarning.
    # The character class itself is kept unchanged.
    word = re.sub(r"[\s\n ]+", "", word)
    word = unicodedata.normalize("NFKC", word)
    word = word.lower().replace(",", "").replace("--", "")
    return word


if __name__ == "__main__":
    main()