TF-IDF (Term Frequency – Inverse Document Frequency) をPythonで実装 - end0tknr's kipple

これまでは Python の scikit-learn に頼っていましたが、今回は TF-IDF を自分で実装してみました。

import math
import collections

def main():
    """Run a small TF-IDF demonstration and print the score of every term.

    TF-IDF expresses how important each term is within each document of a
    corpus.  Background article:
    https://atmarkit.itmedia.co.jp/ait/articles/2112/23/news028.html
    """
    # Local import: keeps this block self-contained regardless of the
    # file-level import list.
    import string

    docs = ["I love programming in Python.",
            "Python programming is fun.",
            "I love coding in Python or in Perl.",
            "Coding is great." ]

    # Preprocessing: lowercase, split on whitespace, then strip surrounding
    # punctuation so that "Python." and "Python" are counted as the same term.
    # Tokens that consist of punctuation only become empty and are dropped.
    parsed_docs = []
    for doc in docs:
        stripped = (word.strip(string.punctuation) for word in doc.lower().split())
        parsed_docs.append([token for token in stripped if token])

    # TF-IDF computation: one {term: score} mapping per document.
    tf_idf_values = compute_tf_idf(parsed_docs)

    # Report the scores document by document.
    for doc_idx, doc in enumerate(docs):
        print(f"\nDocument {doc_idx + 1}: {doc}")
        for term, tf_idf in tf_idf_values[doc_idx].items():
            print(f"  {term}: {tf_idf:.4f}")

def compute_tf_idf(docs):
    """Compute TF-IDF (TF x IDF) for every term of every document.

    Args:
        docs: Collection of tokenized documents; it is passed to
            ``compute_idf`` and then iterated here, so it must support
            more than one pass.

    Returns:
        A list with one dict per document, in input order, mapping each
        term of that document to ``tf * idf``.
    """
    # IDF is a corpus-wide quantity, so it is computed once up front.
    idf_by_term = compute_idf(docs)

    def score(tokens):
        # TF is specific to a single document.
        tf_by_term = compute_tf(tokens)
        return {word: tf_by_term[word] * idf_by_term[word] for word in tf_by_term}

    return [score(tokens) for tokens in docs]

def compute_idf(docs):
    """Compute the smoothed inverse document frequency (IDF) of every term.

    IDF expresses how rare a term is across the corpus: the fewer documents
    contain the term, the larger the value.  The smoothed variant used here
    (see https://ja.wikipedia.org/wiki/Tf-idf) is

        idf(t) = log(N / (1 + df(t))) + 1

    where N is the number of documents and df(t) is the number of documents
    that contain the term t.

    Args:
        docs: Sized collection of documents, each an iterable of hashable
            terms (e.g. a list of lists of strings).

    Returns:
        A dict mapping every term that occurs in ``docs`` to its IDF value.
        An empty corpus yields an empty dict.
    """
    total_docs = len(docs)

    # Document frequency in a single pass.  dict.fromkeys() de-duplicates the
    # terms of one document (keeping first-seen order) so a term repeated
    # inside a document still counts that document only once.  This avoids
    # rescanning every document for every vocabulary term.
    doc_freq = collections.Counter(
        term for document in docs for term in dict.fromkeys(document)
    )

    return {
        term: math.log(total_docs / (1 + containing_docs)) + 1
        for term, containing_docs in doc_freq.items()
    }

def compute_tf(doc):
    """Compute the term frequency (TF) of every term in one document.

    TF = occurrences of the term in the document / number of terms in the
    document.

    Args:
        doc: Tokenized document (a sized iterable of hashable terms).

    Returns:
        A ``collections.Counter`` mapping each term to its relative
        frequency; looking up a term that is absent yields 0.  An empty
        document yields an empty Counter.
    """
    doc_length = len(doc)                    # number of terms in the document
    occurrences = collections.Counter(doc)   # raw count per term
    return collections.Counter(
        {word: hits / doc_length for word, hits in occurrences.items()}
    )

# Run the demonstration only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

TF-IDF (Term Frequency – Inverse Document Frequency) をPythonで実装