# I had been relying on scikit-learn for Python, but here is a from-scratch TF-IDF implementation.
import math
import collections


def main():
    """Compute and print TF-IDF scores for a tiny document collection.

    TF-IDF scores how important each term is to each document: a term
    scores high when it is frequent inside the document but rare across
    the collection as a whole.
    Reference: https://atmarkit.itmedia.co.jp/ait/articles/2112/23/news028.html
    """
    docs = ["I love programming in Python.",
            "Python programming is fun.",
            "I love coding in Python or in Perl.",
            "Coding is great."
            ]

    # Preprocessing: lowercase conversion and whitespace tokenization.
    # NOTE(review): punctuation stays attached to tokens ("python." vs
    # "python" are distinct terms) — matches the original behavior.
    parsed_docs = [doc.lower().split() for doc in docs]

    # TF-IDF computation.
    tf_idf_values = compute_tf_idf(parsed_docs)

    # Display results, one section per document.
    for doc_idx, doc in enumerate(docs):
        print(f"\nDocument {doc_idx + 1}: {doc}")
        for term, tf_idf in tf_idf_values[doc_idx].items():
            print(f" {term}: {tf_idf:.4f}")


def compute_tf_idf(docs):
    """Return a list of ``{term: tf-idf}`` dicts, one per document.

    TF-IDF = TF (per document) x IDF (per collection).

    Args:
        docs: list of tokenized documents (each a list of term strings).

    Returns:
        list of dicts, aligned with ``docs``, mapping each term of the
        document to its TF-IDF weight.
    """
    idf_dict = compute_idf(docs)  # IDF is shared by every document
    tf_idf_dicts = []
    for document in docs:
        tf_dict = compute_tf(document)  # TF is per-document
        tf_idf_dict = {term: tf_dict[term] * idf_dict[term]
                       for term in tf_dict}
        tf_idf_dicts.append(tf_idf_dict)
    return tf_idf_dicts


def compute_idf(docs):
    """Inverse Document Frequency: how rare each term is in the collection.

    Uses the smoothed form ``IDF = log(N / (1 + df)) + 1`` where ``N`` is
    the number of documents and ``df`` the number of documents containing
    the term (see https://ja.wikipedia.org/wiki/Tf-idf).

    Args:
        docs: list of tokenized documents (each a list of term strings).

    Returns:
        dict mapping every term occurring in the collection to its IDF.
    """
    # Build one set per document up front so the per-term membership test
    # below is O(1) instead of an O(len(document)) list scan.
    doc_sets = [set(document) for document in docs]
    all_terms = set(term for doc_set in doc_sets for term in doc_set)
    idf_dict = {}
    for term in all_terms:
        containing_docs = sum(1 for doc_set in doc_sets if term in doc_set)
        idf_dict[term] = math.log(len(docs) / (1 + containing_docs)) + 1
    return idf_dict


def compute_tf(doc):
    """Term Frequency: relative frequency of each term in one document.

    TF = (occurrences of the term) / (total number of terms in the document).

    Args:
        doc: a tokenized document (list of term strings).

    Returns:
        ``collections.Counter`` mapping each term to its relative frequency
        (empty for an empty document).
    """
    tf_dict = collections.Counter(doc)  # raw occurrence counts
    total_terms = len(doc)
    if total_terms == 0:
        return tf_dict  # empty document: avoid division by zero
    for term in tf_dict:
        tf_dict[term] = tf_dict[term] / total_terms
    return tf_dict


if __name__ == '__main__':
    main()