Byte-Pair Encoding (BPE) - 딥러닝 언어 모델

형태소 분리 알고리즘

정규화 (Normalization)
Pre-tokenization
글자 단위 분리
글자 그룹 병합

# Toy training corpus: four English sentences that the cells below
# pre-tokenize and count words from.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

단어 도수 집계

text = corpus[0]
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)

[('This', (0, 4)),
 ('Ġis', (4, 7)),
 ('Ġthe', (7, 11)),
 ('ĠHugging', (11, 19)),
 ('ĠFace', (19, 24)),
 ('ĠCourse', (24, 31)),
 ('.', (31, 32))]

text[0:4]

'This'

text[4:7]

' is'

from collections import defaultdict

# Tally how many times each pre-tokenized word occurs across the corpus.
word_freq = defaultdict(int)

for text in corpus:
    pre_tokenized = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    # Each entry is a (word, offsets) pair; only the word is counted.
    for word, _offset in pre_tokenized:
        word_freq[word] += 1

print(word_freq)

defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})

글자 집합

# Collect every distinct character used by the counted words, in sorted order.
# Joining the words and de-duplicating through a set avoids scanning a list
# once per character, and yields the same sorted list.
alphabet = sorted(set("".join(word_freq)))

print(alphabet)

[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']

# Seed the vocabulary with the "<|endoftext|>" token followed by the alphabet.
vocab = ["<|endoftext|>", *alphabet]

# Start every word as the list of its individual characters.
splits = {word: list(word) for word in word_freq}

splits

{'This': ['T', 'h', 'i', 's'],
 'Ġis': ['Ġ', 'i', 's'],
 'Ġthe': ['Ġ', 't', 'h', 'e'],
 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],
 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 '.': ['.'],
 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],
 'Ġtokenization': ['Ġ',
  't',
  'o',
  'k',
  'e',
  'n',
  'i',
  'z',
  'a',
  't',
  'i',
  'o',
  'n'],
 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],
 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 ',': [','],
 'Ġyou': ['Ġ', 'y', 'o', 'u'],
 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],
 'Ġbe': ['Ġ', 'b', 'e'],
 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],
 'Ġto': ['Ġ', 't', 'o'],
 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],
 'Ġhow': ['Ġ', 'h', 'o', 'w'],
 'Ġthey': ['Ġ', 't', 'h', 'e', 'y'],
 'Ġare': ['Ġ', 'a', 'r', 'e'],
 'Ġtrained': ['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd'],
 'Ġand': ['Ġ', 'a', 'n', 'd'],
 'Ġgenerate': ['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e'],
 'Ġtokens': ['Ġ', 't', 'o', 'k', 'e', 'n', 's']}

def compute_pair_freqs(splits, word_freqs=None):
    """Count how often each adjacent symbol pair occurs, weighted by word frequency.

    Args:
        splits: Mapping from each word to its current list of symbols.
        word_freqs: Mapping from each word to its frequency. When omitted, the
            module-level ``word_freq`` table is used, so the existing
            single-argument call keeps working.

    Returns:
        A ``defaultdict(int)`` mapping ``(left, right)`` symbol pairs to the
        summed frequency of the words in which they appear (a pair occurring
        twice in one word is counted twice).

    Raises:
        KeyError: If a word in the frequency table has no entry in ``splits``.
    """
    if word_freqs is None:
        word_freqs = word_freq

    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        symbols = splits[word]
        # Zipping the list with its one-step shift yields every adjacent pair
        # and nothing at all for words with fewer than two symbols.
        for pair in zip(symbols, symbols[1:]):
            pair_freqs[pair] += freq
    return pair_freqs
pair_freqs = compute_pair_freqs(splits)

# Preview the first six pair counts in insertion order.
for i, (key, count) in enumerate(pair_freqs.items()):
    print(key, count)
    if i >= 5:
        break

('T', 'h') 3
('h', 'i') 3
('i', 's') 5
('Ġ', 'i') 2
('Ġ', 't') 7
('t', 'h') 3

# Pick the most frequent pair. max() returns the first maximal entry, so ties
# resolve to the earliest pair; an empty table yields the ("", None) placeholders.
best_pair, max_freq = max(
    pair_freqs.items(),
    key=lambda item: item[1],
    default=("", None),
)

print(best_pair, max_freq)

('Ġ', 't') 7

merges = {best_pair: ''.join(best_pair)}
merges

{('Ġ', 't'): 'Ġt'}

merged_pattern = ''.join(best_pair)
vocab.append(merged_pattern)

def merge_pair(a, b, splits):
    """Merge every adjacent occurrence of ``a`` followed by ``b`` into ``a + b``.

    Only ``splits`` is consulted, so the function does not depend on any
    module-level word table.

    Args:
        a: Left symbol of the pair to merge.
        b: Right symbol of the pair to merge.
        splits: Mapping from each word to its current list of symbols. It is
            updated in place.

    Returns:
        The same ``splits`` mapping, with affected words replaced by their
        merged symbol lists.
    """
    merged_symbol = a + b
    # Only values of existing keys are reassigned below, so the dict's size
    # never changes while it is being iterated.
    for word, symbols in splits.items():
        if len(symbols) < 2:
            continue

        # Rebuild the word in one left-to-right pass instead of re-slicing
        # the list after every merge.
        merged = []
        last = len(symbols) - 1
        i = 0
        while i <= last:
            if i < last and symbols[i] == a and symbols[i + 1] == b:
                merged.append(merged_symbol)
                i += 2
            else:
                merged.append(symbols[i])
                i += 1
        splits[word] = merged
    return splits

splits['Ġtrained']

['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd']

splits = merge_pair(best_pair[0], best_pair[1], splits)
print(splits['Ġtrained'])

['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']

vocab_size = 50

# Keep learning merges until the vocabulary reaches the target size.
while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    if not pair_freqs:
        # No adjacent pairs remain, so there is nothing left to merge.
        # Without this guard best_pair would stay empty and the merge_pair
        # call below would fail.
        break
    # max() returns the first maximal key, so ties resolve to the earliest
    # pair, exactly like a strict ">" scan.
    best_pair = max(pair_freqs, key=pair_freqs.get)
    max_freq = pair_freqs[best_pair]
    merged_token = ''.join(best_pair)
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = merged_token
    vocab.append(merged_token)

print(merges)

{('Ġ', 't'): 'Ġt', ('i', 's'): 'is', ('e', 'r'): 'er', ('Ġ', 'a'): 'Ġa', ('Ġt', 'o'): 'Ġto', ('e', 'n'): 'en', ('T', 'h'): 'Th', ('Th', 'is'): 'This', ('o', 'u'): 'ou', ('s', 'e'): 'se', ('Ġto', 'k'): 'Ġtok', ('Ġtok', 'en'): 'Ġtoken', ('n', 'd'): 'nd', ('Ġ', 'is'): 'Ġis', ('Ġt', 'h'): 'Ġth', ('Ġth', 'e'): 'Ġthe', ('i', 'n'): 'in', ('Ġa', 'b'): 'Ġab', ('Ġtoken', 'i'): 'Ġtokeni'}

print(vocab)

['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th', 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in', 'Ġab', 'Ġtokeni']