底下程式碼中,先將 train_ts.csv 進行斷詞,再開始向量化,最後偵測跟 “美國” 相近的結果
# Segment every line of train_ts.csv with jieba, train a Word2Vec model on the
# resulting token lists, and save the model as "w2v_model_chinese".
# Input files read from the working directory: dict.txt (jieba dictionary),
# stops.txt (one stop word per line) and train_ts.csv.
import codecs
import re
import time

import jieba
import pandas as pd
from gensim.models import Word2Vec

# Lift every pandas display limit (columns, rows, width, cell width).
display = pd.options.display
display.max_columns = None
display.max_rows = None
display.width = None
display.max_colwidth = None

# Characters removed before segmentation: ASCII digits and letters plus the
# listed punctuation/symbols. Raw string with '-', '[' and ']' escaped so the
# whole list stays inside ONE character class; in the previous pattern an
# unescaped ']' closed the class early and most characters were never removed.
TEXT_CLEANING_RE = r"[0-9a-zA-Z,:.?!\"+\-*/_='()\[\]|<>$、《》”%【】“ …❤]"


def preprocess(text):
    """Return *text* lower-cased, without TEXT_CLEANING_RE characters or spaces.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string (may be empty).
    """
    # Matches become spaces, the ends are stripped (this also drops the
    # trailing newline of a file line), then every remaining space is removed.
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    return text.replace(" ", "")


print("斷詞處理中...............")
jieba.set_dictionary('dict.txt')

# Load the stop words into a set for O(1) membership tests.
with open('stops.txt', 'r', encoding='utf-8') as file:
    stops = set(file.read().split('\n'))

# One token list per input line; jieba runs in full mode (cut_all=True).
vocabularies = []
with open('train_ts.csv', 'r', encoding='utf-8') as file:
    for line in file:
        line = preprocess(line)
        terms = [t for t in jieba.cut(line, cut_all=True) if t not in stops]
        vocabularies.append(terms)

w2v_model = Word2Vec(
    window=7,
    min_count=10,
    workers=8,
)
w2v_model.build_vocab(vocabularies)
words = list(w2v_model.wv.key_to_index)
vocab_size = len(words)
print("Vocab size", vocab_size)

print("開始向量化....")
t1 = time.time()
w2v_model.train(vocabularies, total_examples=len(vocabularies), epochs=32)
t2 = time.time()
print(f'向量化時間 : {t2-t1}秒')
# Persist the model so later runs can load it instead of retraining.
w2v_model.save("w2v_model_chinese")

# Sample output recorded by the author (reproduced verbatim; note that the
# "向量化處理中" progress line shown here is not printed by the code above):
#
# 斷詞處理中...............
# Building prefix dict from E:\python\w2v\dict.txt ...
# Loading model from cache C:\Users\mahal\AppData\Local\Temp\jieba.u3f38139618254a46357e04c6afbde5be.cache
# Loading model cost 0.274 seconds.
# Prefix dict has been built successfully.
# 向量化處理中...............
# Vocab size 30530
# 向量化時間 : 48.616782903671265秒
載入向量模型
因為 train_ts.csv 有 320,767 筆資料,訓練向量化模型需要一段時間;還好上面的程式碼已將模型儲存成 “w2v_model_chinese” 檔案,所以下次要使用時,可以直接載入進行偵測。
無法訓練的人,可由本站下載 w2v_model_chinese
# Load the saved Word2Vec model and interactively print the most similar
# words for each term typed by the user; typing "quit" ends the loop.
from gensim.models import Word2Vec

model = Word2Vec.load("w2v_model_chinese")
while True:
    voc = input("請輸入要查詢的字詞 : ")
    if voc == 'quit':
        break
    try:
        rs = model.wv.most_similar(voc)
    except KeyError:
        # most_similar raises KeyError when the term is not in the model's
        # vocabulary; other errors (and Ctrl-C) are no longer swallowed.
        print("不在字詞中")
    else:
        for r in rs:
            print(r)

# Sample output recorded by the author (reproduced verbatim):
#
# ('日本軍', 0.6112362146377563)
# ('軍國', 0.579851508140564)
# ('軍國主義', 0.558709979057312)
# ('本天', 0.5249565839767456)
# ('日本海', 0.5150989294052124)
# ('底牌', 0.4993012547492981)
# ('來日', 0.49267053604125977)
# ('日本人', 0.48523956537246704)
# ('導彈', 0.4833442270755768)
# ('日本國', 0.4820500314235687)