end0tknr's kipple - web写経開発

太宰府天満宮の狛犬って、妙にカワイイ

sudachipy for python (miniconda for win)による sudachiユーザ辞書作成

以下の通りです

本当は sudachipy.exe コマンドを呼ぶのではなく、 https://github.com/WorksApplications/SudachiPy/blob/develop/sudachipy/dictionarylib/userdictionarybuilder.py にある UserDictionaryBuilder クラスを Python から直接呼びたかったのですが、userdictionarybuilder.py は pip install sudachipy ではインストールされないようでした。そのため、以下のように subprocess 経由で sudachipy.exe ubuild コマンドを実行しています。

import csv
import datetime
import os
import re
import sys
import subprocess
import sudachipy
import unicodedata

# All dictionary files are kept in the same directory as this script.
use_dic_dir = os.path.dirname(os.path.abspath(__file__))

# Pipeline files: tab-separated source -> generated CSV -> compiled dictionary.
dic_src_path = f"{use_dic_dir}/user.dic.src.txt"
dic_csv_path = f"{use_dic_dir}/user.dic.csv"
user_dic_path = f"{use_dic_dir}/user.dic"

# Hard-coded, machine-specific locations of the sudachipy command and of the
# system dictionary that is passed to "ubuild -s".
sudachi_cmd = "C:/Users/end0t/miniconda3/Scripts/sudachipy.exe"
sys_dic_path = "C:/Users/end0t/miniconda3/Lib/site-packages/sudachidict_core/resources/system.dic"

def main():
    """Rebuild the Sudachi user dictionary from the TSV source file.

    Steps: load the source words, write them out as a CSV, move any existing
    user dictionary aside to a date-stamped backup, then build the new
    dictionary and print its path.
    """
    dic_words = load_dic_src( dic_src_path )
    dic_csv_path = save_dic_csv( dic_words )

    # Back up the previous user dictionary file.
    global user_dic_path
    if os.path.exists( user_dic_path ):
        bakup_path = user_dic_path + "." + datetime.datetime.now().strftime('%Y%m%d')
        # os.replace overwrites an existing backup from earlier the same day;
        # os.rename would raise FileExistsError on Windows in that case.
        os.replace(user_dic_path, bakup_path)

    user_dic_path = make_user_dic( dic_csv_path )
    print( user_dic_path )

# Build the user dictionary with the "sudachipy.exe ubuild" command.
def make_user_dic( dic_csv_path ):
    """Compile ``dic_csv_path`` into the user dictionary at ``user_dic_path``.

    The command is run as an argument list without a shell, so paths that
    contain spaces or shell metacharacters are passed through unchanged.

    Args:
        dic_csv_path: Path of the CSV file handed to ``ubuild``.

    Returns:
        The module-level ``user_dic_path`` that was given to ``ubuild -o``.

    Raises:
        subprocess.CalledProcessError: The command exited with a non-zero
            status; its stderr is echoed to this process's stderr first.
    """
    cmd = [ sudachi_cmd, "ubuild",
            "-s", sys_dic_path,
            "-o", user_dic_path,
            dic_csv_path ]
    print( " ".join(cmd) )

    proc = subprocess.run(
        cmd,
        stdin  = subprocess.DEVNULL,
        stdout = subprocess.PIPE,
        stderr = subprocess.PIPE,
        universal_newlines = True,
        errors = "replace")  # never fail on undecodable tool output

    if proc.returncode != 0:
        # Surface the failure instead of returning the path of a dictionary
        # that was never written.
        sys.stderr.write( proc.stderr )
        raise subprocess.CalledProcessError(
            proc.returncode, cmd, output=proc.stdout, stderr=proc.stderr )

    return user_dic_path
    
# Write the CSV used to build the user dictionary.
# https://qiita.com/sakamoto_mi/items/c1787973dd1a591c9957
# https://github.com/WorksApplications/Sudachi/blob/develop/docs/user_dict.md
def save_dic_csv( dic_words ):
    """Write one CSV row per entry of ``dic_words`` to ``dic_csv_path``.

    Every row is produced from the same fixed template; only the word
    (inserted twice) and its caption vary. The meaning of the fixed columns
    is presumably the one given in the Sudachi user_dict.md linked above.

    Args:
        dic_words: Mapping of word -> caption.

    Returns:
        The module-level ``dic_csv_path`` that was written.
    """
    row_format = (
        "{word},4789,4789,5000,{word},名詞,普通名詞,一般,*,*,*,*,{caption},*,*,*,*,*"
    )

    with open(dic_csv_path, mode="w", encoding='utf-8') as out:
        out.writelines(
            row_format.format(word=entry, caption=label) + "\n"
            for entry, label in dic_words.items()
        )

    return dic_csv_path

# Load the TSV that the user-dictionary CSV is generated from.
# Rows: headwords, columns: synonyms.
def load_dic_src( dic_src_path ):
    """Read the tab-separated source file into a word -> caption mapping.

    The first cell of each line is the caption (headword); every cell on the
    line, the first included, is mapped to that caption after normalization.
    A word that is already registered is reported on stderr and skipped.
    If the first cell is empty or a duplicate, no caption is set and the
    remaining cells of that line are skipped as well.

    Args:
        dic_src_path: Path of the UTF-8 TSV file (a leading BOM is accepted).

    Returns:
        dict mapping each normalized word to its normalized caption.
    """
    ret_datas = {}

    # utf-8-sig drops a leading BOM so it cannot end up inside the first word.
    with open(dic_src_path, encoding='utf-8-sig') as f:
        for tsv_line in f:
            words = tsv_line.strip().split("\t")
            caption = None
            for i, word in enumerate(words):
                word = normalize_word( word )

                # A cell that normalizes to nothing (empty cell, only commas
                # or whitespace) must not become an empty dictionary entry.
                if not word:
                    continue
                if word in ret_datas:
                    print( f"WARN duplicate word exist : {word}",file=sys.stderr )
                    continue
                if i == 0:
                    caption = word
                if not caption:
                    continue
                ret_datas[word] = caption
    return ret_datas

# Sudachi user dictionaries need character-normalized entries.
# https://zenn.dev/sorami/articles/6bdb4bf6c7f207
def normalize_word( word ):
    """Return ``word`` normalized for use in the user dictionary.

    Whitespace is removed, the text is NFKC-normalized and lower-cased, and
    commas are dropped.

    Args:
        word: Raw cell text.

    Returns:
        The normalized string; may be empty.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    # For str patterns \s already covers newline, space and full-width space.
    word = re.sub(r"\s+", "", word)
    word = unicodedata.normalize('NFKC', word)
    # Commas are removed after NFKC so full-width commas are caught too.
    word = word.lower().replace(",", "")
    return word

# Run the rebuild only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()