Byte-Pair Encoding (BPE)
형태소 분리 알고리즘
정규화 (Normalization)
Pre-tokenization
글자 단위 분리
글자 그룹 병합
# Toy corpus used to walk through BPE training by hand.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

from transformers import AutoTokenizer

# Reuse GPT-2's tokenizer only for its normalization / pre-tokenization
# pipeline; the merges themselves are trained from scratch below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
단어 도수 집계
from collections import defaultdict

# Peek at GPT-2 pre-tokenization on the first sentence: each entry is
# (piece, (start, end)) character offsets; a leading 'Ġ' marks a space.
text = corpus[0]
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
text[0:4]
text[4:7]

# Count how often each pre-tokenized word occurs across the corpus.
word_freq = defaultdict(int)
for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    for word, _offset in words_with_offsets:
        word_freq[word] += 1
print(word_freq)
글자 집합
# Base alphabet: every distinct character appearing in any word, sorted.
alphabet = sorted({char for word in word_freq for char in word})
print(alphabet)

# Seed the vocabulary with GPT-2's special end-of-text token followed by
# the base alphabet; learned merges are appended later.
vocab = ["<|endoftext|>"] + alphabet

# Every word starts fully split into single characters; training will
# progressively fuse adjacent symbols in these lists.
splits = {word: list(word) for word in word_freq}
splits
def compute_pair_freqs(splits, word_freqs=None):
    """Count how often each adjacent symbol pair occurs, weighted by word frequency.

    Args:
        splits: mapping word -> list of current symbols for that word.
        word_freqs: optional mapping word -> corpus frequency. Defaults to
            the module-level ``word_freq`` table, preserving the original
            call signature while removing the hidden global dependency.

    Returns:
        defaultdict mapping (left, right) symbol pairs to weighted counts.
    """
    if word_freqs is None:
        word_freqs = word_freq  # backward-compatible fallback to the global table
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        chars = splits[word]
        # A single-symbol word has no adjacent pairs to count.
        if len(chars) == 1:
            continue
        for i in range(len(chars) - 1):
            pair = (chars[i], chars[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs
pair_freqs = compute_pair_freqs(splits)

# Preview the first six pairs and their weighted frequencies.
for pair, freq in list(pair_freqs.items())[:6]:
    print(pair, freq)
# Scan for the most frequent adjacent pair; on ties the first one seen wins.
best_pair, max_freq = "", None
for candidate, count in pair_freqs.items():
    if max_freq is None or count > max_freq:
        best_pair, max_freq = candidate, count
print(best_pair, max_freq)
# Record the first merge rule and add the fused symbol to the vocabulary.
merges = {best_pair: ''.join(best_pair)}
merges
merged_pattern = ''.join(best_pair)
vocab.append(merged_pattern)

def merge_pair(a, b, splits):
    """Apply the merge rule (a, b) -> a+b to every word's split.

    Iterates over ``splits`` itself rather than the global ``word_freq``
    table (the original hidden dependency), so the function relies only on
    its arguments; the key sets are identical, so behavior is unchanged.

    Args:
        a: left symbol of the pair to merge.
        b: right symbol of the pair to merge.
        splits: mapping word -> list of symbols; updated in place.

    Returns:
        The same ``splits`` mapping with every adjacent (a, b) fused.
    """
    for word in list(splits):
        split = splits[word]
        # Single-symbol words contain no pairs.
        if len(split) == 1:
            continue
        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                # Fuse the pair; do not advance i, so the freshly created
                # symbol can immediately merge with the following one.
                split = split[:i] + [a + b] + split[i + 2:]
            else:
                i += 1
        splits[word] = split
    return splits

splits['Ġtrained']
splits = merge_pair(best_pair[0], best_pair[1], splits)
print(splits['Ġtrained'])
# Grow the vocabulary to the target size by repeatedly merging the most
# frequent adjacent pair and recording each merge rule.
vocab_size = 50
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # Every word is fully merged into one symbol: no pairs remain, and
        # the original code would have crashed on merge_pair(*"", splits).
        break
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or freq > max_freq:
            max_freq = freq
            best_pair = pair
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = ''.join(best_pair)
    vocab.append(merges[best_pair])

print(merges)
print(vocab)