Word2Vec
데이터
import datasets
# Load the train/validation/test CSV files found under dataset_dir as one
# dictionary of splits.
dataset_dir = 'data/korean-english'
split_names = ('train', 'validation', 'test')
한영데이터 = datasets.load_dataset(
    'csv',
    # One "<split>.csv" file per split name.
    data_files={name: f'{dataset_dir}/{name}.csv' for name in split_names},
)
# Report the row count of every split, left-aligning the split name and
# printing the count with thousands separators.
for split, rows in 한영데이터.items():
    print(f"{split:<12}: {len(rows):,}")
# Output:
# train : 833,321
# validation : 104,165
# test : 104,165
import sentencepiece as spm
# Create the SentencePiece processor and load the model file 'ko_spm.model'.
한국어_형태분석기 = spm.SentencePieceProcessor()
# The load is performed outside of an `assert`: assert statements are removed
# when Python runs with -O, which would silently skip the load itself and leave
# the processor without a model.
if not 한국어_형태분석기.load('ko_spm.model'):
    raise RuntimeError("모델 로드 실패")

import pandas as pd
# Encode one example sentence twice: once as string pieces and once as the
# matching integer ids, then show them side by side.
예문 = '대통령께서 입장하십니다.'
형태소목록 = 한국어_형태분석기.encode(예문, out_type=str)
정수시퀀스 = 한국어_형태분석기.encode(예문, out_type=int)
# Single-row table whose column labels are the ids and whose cells are the
# pieces. The two sequences are passed directly (rather than through an
# id -> piece dict) so that every position is kept: a piece occurring more than
# once in the sentence would otherwise be collapsed into one column.
pd.DataFrame([형태소목록], columns=정수시퀀스)
# Output placeholder recorded by the export: Loading...
def _tokenize_korean(example):
    """Split example['ko'] into SentencePiece pieces without the '▁' marker.

    Pieces that are empty once the marker has been removed are dropped.
    Returns a dict with the surviving pieces under the 'tokens' key.
    """
    pieces = 한국어_형태분석기.encode(example['ko'], out_type=str)
    stripped = (piece.replace('▁', '') for piece in pieces)
    # Keep only pieces that still have at least one character.
    return {'tokens': [piece for piece in stripped if piece]}


# Keep just the Korean column and attach a 'tokens' field to every row.
한국어문장 = 한영데이터.select_columns('ko')
한국어형태분석문장 = 한국어문장.map(_tokenize_korean)

# NOTE(review): the mapped result above is replaced immediately by a copy read
# from 'ko_tokens'; presumably that directory was written by a save step that
# is not shown here -- confirm.
한국어형태분석문장 = datasets.load_from_disk('ko_tokens')
# List the available splits, then preview five training rows taken after a
# shuffle with a fixed seed.
print(한국어형태분석문장.keys())
sample_rows = 한국어형태분석문장['train'].shuffle(seed=7).take(5)
pd.DataFrame(sample_rows)
# Output:
# dict_keys(['train', 'validation', 'test'])
# Loading...
import logging

# Route INFO-level log records to the console with a timestamped format.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)

import gensim

# Train a Word2Vec model with 200-dimensional vectors on the 'tokens' column
# of the training split.
training_sentences = 한국어형태분석문장['train']['tokens']
word2vec = gensim.models.Word2Vec(sentences=training_sentences, vector_size=200)
# Output
# Fetching long content....
# Write the trained model to disk, then read it back from the same file.
model_path = 'word2vec.gensim'
word2vec.save(model_path)
# Output:
# 2025-12-17 13:40:24,795 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'word2vec.gensim', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2025-12-17T13:40:24.795608', 'gensim': '4.4.0', 'python': '3.10.18 (main, Jun 5 2025, 13:14:17) [GCC 11.2.0]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'saving'}
# 2025-12-17 13:40:24,796 : INFO : not storing attribute cum_table
# 2025-12-17 13:40:24,817 : INFO : saved word2vec.gensim

word2vec = gensim.models.Word2Vec.load(model_path)
# Output:
# 2025-12-17 13:40:26,042 : INFO : loading Word2Vec object from word2vec.gensim
# 2025-12-17 13:40:26,045 : INFO : loading wv recursively from word2vec.gensim.wv.* with mmap=None
# 2025-12-17 13:40:26,046 : INFO : setting ignored attribute cum_table to None
# 2025-12-17 13:40:26,065 : INFO : Word2Vec lifecycle event {'fname': 'word2vec.gensim', 'datetime': '2025-12-17T13:40:26.065101', 'gensim': '4.4.0', 'python': '3.10.18 (main, Jun 5 2025, 13:14:17) [GCC 11.2.0]', 'platform': 'Linux-6.6.87.2-microsoft-standard-WSL2-x86_64-with-glibc2.35', 'event': 'loaded'}
# Shape of the learned embedding matrix.
word2vec.wv.vectors.shape
# Output:
# (5056, 200)

# Look up the vector of a single word and print its type, shape and first ten
# components rounded to three decimals.
단어 = '사랑'
assert 단어 in word2vec.wv
단어벡터 = word2vec.wv.get_vector(단어)
print(type(단어벡터))
leading_values = 단어벡터[:10].round(3)
print(단어, '->', f'{단어벡터.shape}: {leading_values} ...')
# Output:
# <class 'numpy.ndarray'>
# 사랑 -> (200,): [ 0.631 -0.93 -1.12 3.418 -1.928 3.14 0.102 2.24 0.418 1.092] ...
# Words most similar to the query word; the recorded output lists ten
# (word, similarity) pairs.
word2vec.wv.most_similar(단어)
# Output:
# [('마음', 0.6226913332939148),
# ('이야기', 0.6088430881500244),
# ('행복', 0.6035996675491333),
# ('좋아', 0.5592609643936157),
# ('아이들', 0.5562693476676941),
# ('친구', 0.5468686819076538),
# ('모습', 0.5412734150886536),
# ('그녀', 0.5346773266792297),
# ('남편', 0.531864583492279),
# ('생각', 0.519743025302887)]

# Query with one positive and one negative key, keeping the top five matches.
# The keys are passed as explicit lists rather than bare strings so the call
# does not depend on gensim coercing a single string into a one-element list.
results = word2vec.wv.most_similar(positive=['아버지'], negative=['남자'], topn=5)
# Tabulate the (word, similarity) pairs with similarities rounded to 3 places.
pd.DataFrame(results, columns=['단어', '유사도']).round(3)
# Output placeholder recorded by the export: Loading...