end0tknr's kipple - web写経開発

TF-IDF feature word extraction with sklearn.feature_extraction.text.TfidfVectorizer for Python

As a further follow-up to the previous entry, the script below tokenizes each QA record's title and body with Sudachi, keeps only common nouns, and then computes per-document TF-IDF feature words with TfidfVectorizer.

from sklearn.feature_extraction.text import TfidfVectorizer
from sudachipy import dictionary
import csv
import pandas
import re
import unicodedata

qa_sys_src_csv    = "qa_srcs_full.csv"
sudachi_conf_json = "c:/Users/end0t/tmp/QA_SGST/sudachi_user_dic/sudachi.json"

def main():
    # load the QA text that TF-IDF will be computed over
    qas = load_qa_sys_src()

    tokenizer_obj = dictionary.Dictionary(config_path=sudachi_conf_json).create()

    docs = []
    
    for qa_src in qas:
        org_txt = "".join([
            qa_src["表題"],"。",qa_src["相談内容"]
        ])
        # morphological analysis (tokenization) with Sudachi
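        # (no explicit split mode is passed, so SudachiPy's default SplitMode.C is used)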
        tokens = tokenizer_obj.tokenize( org_txt )

        doc = []
        for token in tokens:
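            # keep only common nouns (普通名詞) and use Sudachi's normalized form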
            if token.part_of_speech()[1] != "普通名詞":
                continue
            doc.append( token.normalized_form() )
        docs.append( " ".join( doc ) )

    # compute TF-IDF
    vectorizer = TfidfVectorizer(smooth_idf=False)
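    # with smooth_idf=False, idf(t) = ln(n_docs / df(t)) + 1 (no +1 count smoothing);
    # each document row is then L2-normalized by default (norm="l2")
    # note: the default token_pattern (r"(?u)\b\w\w+\b") drops single-character tokens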
    # fit the vectorizer and extract the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(docs)
    tf_idf = tfidf_matrix.toarray()  # convert the sparse TF-IDF matrix to a dense array

    # get the vocabulary words corresponding to the matrix columns
    feature_names = vectorizer.get_feature_names_out()
    # convert to a pandas DataFrame and transpose: rows = terms, columns = document indices
    tf_idf_df = pandas.DataFrame(tf_idf, columns=feature_names).T

    # show the top-10 feature words for each of the first 100 documents
    for doc_no in range(min(100, len(docs))):
        print( qas[doc_no]["表題"], qas[doc_no]["相談内容"] )
        print( tf_idf_df.sort_values(doc_no, ascending=False)[:10][doc_no] )


def load_qa_sys_src():
    ret_datas = []
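    # the source CSV is cp932 (Shift_JIS) encoded; columns include 相談No / 表題 / 相談内容 / 回答内容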
    with open(qa_sys_src_csv, encoding="cp932") as f:
        reader = csv.DictReader(f)
        for row in reader:
            row["相談No"]   = normalize_word( row["相談No"] )
            row["表題"]     = normalize_word( row["表題"] )
            row["相談内容"] = normalize_word( row["相談内容"] )
            row["回答内容"] = normalize_word( row["回答内容"] )
            ret_datas.append(row)
    return ret_datas


# character normalization is required when using a Sudachi user dictionary
# https://zenn.dev/sorami/articles/6bdb4bf6c7f207
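# e.g. normalize_word("ＴＦ－ＩＤＦ　処理") -> "tf-idf処理"
#   (whitespace removed, full-width characters folded by NFKC, lower-cased)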
def normalize_word( word ):
    word = re.sub("[\s\n ]+","",word)
    word = unicodedata.normalize('NFKC', word)
    word = word.lower().replace(",","").replace("--","")
    return word
    
if __name__ == '__main__':
    main()
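
For reference, here is a minimal standalone sketch of just the TfidfVectorizer step, run on already tokenized (space-joined) text. The sample documents are made up for illustration and are not taken from the QA data.

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas

# made-up, pre-tokenized (space-separated) sample documents
docs = [
    "印刷 用紙 詰まり 発生",
    "印刷 速度 低下",
    "用紙 サイズ 設定 変更",
]

vectorizer = TfidfVectorizer(smooth_idf=False)
tfidf_df = pandas.DataFrame(
    vectorizer.fit_transform(docs).toarray(),
    columns=vectorizer.get_feature_names_out(),
).T  # rows = terms, columns = document indices

# top feature words of document 0
print(tfidf_df.sort_values(0, ascending=False)[0][:5])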