# Transformer (트랜스포머)
# The notebook export fused two import statements into one line; restore them.
import keras
from collections import namedtuple

# Hyper-parameter bundle for the transformer demo.
# Korean field names: 시퀀스길이 = sequence length, 어휘수 = vocabulary size,
# 임베딩차원 = embedding dimension, 헤드수 = number of attention heads.
config = namedtuple(
    'config', ['시퀀스길이', '어휘수', '임베딩차원', '헤드수']
)(시퀀스길이=512, 어휘수=10000, 임베딩차원=256, 헤드수=8)

# --- Embedding ---
def build_embedding(입력시퀀스, 어휘수, 임베딩차원):
    """Build a token + learned positional embedding sub-model.

    Args:
        입력시퀀스: ``keras.Input`` of shape ``(batch, seq_len)`` holding
            integer token ids; its static ``seq_len`` sizes the position table.
        어휘수: vocabulary size (``input_dim`` of the word embedding).
        임베딩차원: output embedding dimension.

    Returns:
        ``keras.Model`` named ``'embedding'`` mapping the token-id sequence
        to a ``(batch, seq_len, 임베딩차원)`` tensor.
    """
    # A static sequence length is needed for the position-embedding table.
    시퀀스길이 = 입력시퀀스.shape[1]

    단어임베딩 = keras.layers.Embedding(
        input_dim=어휘수, output_dim=임베딩차원, name='word_embedding')
    위치임베딩 = keras.layers.Embedding(
        input_dim=시퀀스길이, output_dim=임베딩차원, name='position_embedding')

    # One position vector per index 0..seq_len-1, broadcast over the batch
    # when added to the word vectors.
    위치인덱스 = keras.ops.arange(0, 시퀀스길이, dtype='int32')
    합성 = 단어임베딩(입력시퀀스) + 위치임베딩(위치인덱스)
    return keras.Model(inputs=입력시퀀스, outputs=합성, name='embedding')
# Build and inspect the embedding sub-model for a fixed-length int32 sequence.
# (The notebook export fused the 'Encoder' section heading onto the summary
# call; it is restored here as a comment.)
입력시퀀스 = keras.Input(shape=(config.시퀀스길이,), dtype='int32', name='input_sequence')
embedding = build_embedding(입력시퀀스, config.어휘수, config.임베딩차원)
embedding.summary()

# --- Encoder ---
def build_encoder(임베딩차원, 헤드수, name=None, **config):
    """Build one transformer encoder block as a ``keras.Model``.

    Args:
        임베딩차원: model (embedding) dimension of inputs and outputs.
        헤드수: number of attention heads; per-head size is 임베딩차원 // 헤드수.
        name: optional model name.
        **config: extra options; only ``'잠재차원'`` is read — the hidden
            width of the feed-forward MLP (default ``임베딩차원 * 4``).
            NOTE(review): this keyword-catchall shadows the module-level
            ``config`` namedtuple inside the function body.

    Returns:
        ``keras.Model([encoder_input, attention_mask])`` producing a tensor
        of the same shape as ``encoder_input``.
    """
    encoder_input = keras.Input(shape=(None, 임베딩차원), name='encoder_input')
    # NOTE(review): this is a rank-2 (batch, seq) mask, while Keras
    # MultiHeadAttention documents a (batch, target, source) mask — confirm
    # the broadcast behaves as intended on this Keras version/backend.
    attention_mask = keras.Input(shape=(None,), dtype='float32', name='attention_mask')

    # Self-attention sub-layer: attention -> dropout -> residual -> layer norm.
    attn = keras.layers.MultiHeadAttention(
        num_heads=헤드수, key_dim=임베딩차원 // 헤드수, dropout=0.1,
        name='self_attention')
    x = attn(query=encoder_input, value=encoder_input, attention_mask=attention_mask)
    x = keras.layers.Dropout(0.1)(x)
    x = x + encoder_input  # residual connection
    x = keras.layers.LayerNormalization(epsilon=1e-6, name='layer_norm_1')(x)

    # Position-wise feed-forward sub-layer, again with residual + layer norm.
    x0 = x
    잠재차원 = config.get('잠재차원', 임베딩차원 * 4)
    mlp = keras.Sequential([
        keras.layers.Dense(잠재차원, activation='relu'),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(임베딩차원),
    ], name='mlp')
    x = mlp(x)
    x = x + x0  # residual connection
    x = keras.layers.LayerNormalization(epsilon=1e-6, name='layer_norm_2')(x)
    return keras.Model(inputs=[encoder_input, attention_mask], outputs=x, name=name)
keras.backend.clear_session()

# Build and inspect a single encoder block.
# NOTE(review): `x` and `어텐션마스크` are created but never wired into
# `encoder` here — the encoder is summarized standalone with its own inputs.
입력시퀀스 = keras.Input(shape=(config.시퀀스길이,), dtype='int32', name='input_sequence')
x = build_embedding(입력시퀀스, config.어휘수, config.임베딩차원)(입력시퀀스)
어텐션마스크 = keras.Input(shape=(config.시퀀스길이,), dtype='float32', name='attention_mask')
encoder = build_encoder(임베딩차원=config.임베딩차원, 헤드수=config.헤드수, name='encoder')
encoder.summary()

# --- Transformer ---
def build_encoder_stack(임베딩차원, 헤드수, 층수, name=None):
    """Chain ``층수`` encoder blocks into one ``keras.Model``.

    Each block's output feeds the next block's input, and every block
    receives the same (batch, seq) attention mask. Blocks are named
    ``encoder_1`` .. ``encoder_{층수}``.
    """
    encoder_input = keras.Input(shape=(None, 임베딩차원), name='encoder_stack_input')
    attention_mask = keras.Input(shape=(None,), dtype='float32', name='encoder_stack_attention_mask')

    x = encoder_input
    for 층번호 in range(1, 층수 + 1):
        블록 = build_encoder(임베딩차원, 헤드수, name=f'encoder_{층번호}')
        x = 블록([x, attention_mask])

    return keras.Model(inputs=[encoder_input, attention_mask], outputs=x, name=name)
keras.backend.clear_session()

# Assemble the full model: token ids -> embedding -> 8-layer encoder stack.
입력시퀀스 = keras.Input(shape=(config.시퀀스길이,), dtype='int32', name='input_sequence')
어텐션마스크 = keras.Input(shape=(config.시퀀스길이,), dtype='float32', name='attention_mask')

x = build_embedding(입력시퀀스, config.어휘수, config.임베딩차원)(입력시퀀스)
encoder_stack = build_encoder_stack(
    임베딩차원=config.임베딩차원, 헤드수=config.헤드수, 층수=8, name='encoder_stack')
x = encoder_stack([x, 어텐션마스크])

model = keras.Model(inputs=[입력시퀀스, 어텐션마스크], outputs=x, name='transformer')
model.summary()
# The notebook export fused three statements together; restore them as
# separate calls.
model.get_layer('encoder_stack').summary()

# Render architecture diagrams to PNG (plot_model requires pydot + graphviz).
keras.utils.plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    to_file='transformer.png')
keras.utils.plot_model(
    model.get_layer('encoder_stack'),
    show_shapes=True,
    show_layer_names=True,
    to_file='encoder_stack.png')