BERT
In [1]:
Copied!
import transformers
import torch
from custom_transformer import Encoder
import transformers
import torch
from custom_transformer import Encoder
Hugging Face 中的模型通常都对应一份 config(配置对象),其中存储了模型的超参数。
In [2]:
Copied!
# Load the hyperparameter configuration of `bert-base-uncased`.
# local_files_only=True is passed, so this presumably reads only from the
# local Hugging Face cache and does not touch the network.
bert_config = transformers.BertConfig.from_pretrained('bert-base-uncased', local_files_only=True)
# Bare expression: the notebook displays the loaded config as the cell output.
bert_config
# Load the hyperparameter configuration of `bert-base-uncased`.
# local_files_only=True is passed, so this presumably reads only from the
# local Hugging Face cache and does not touch the network.
bert_config = transformers.BertConfig.from_pretrained('bert-base-uncased', local_files_only=True)
# Bare expression: the notebook displays the loaded config as the cell output.
bert_config
/opt/homebrew/lib/python3.11/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`. warnings.warn(
Out[2]:
BertConfig { "architectures": [ "BertForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "classifier_dropout": null, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-12, "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 12, "num_hidden_layers": 12, "pad_token_id": 0, "position_embedding_type": "absolute", "transformers_version": "4.41.2", "type_vocab_size": 2, "use_cache": true, "vocab_size": 30522 }
首先,我们可以直接加载预训练的bert-base-uncased模型,用于和我们的模型进行对比。
In [3]:
Copied!
# Tokenizer matching the pretrained checkpoint (loaded with local_files_only=True).
bert_tokenizer = transformers.BertTokenizer.from_pretrained(
'bert-base-uncased', local_files_only=True
)
# Reference model used for comparison. add_pooling_layer=False builds the
# model without the pooler head, so it ends at the encoder stack.
# NOTE(review): unlike the tokenizer/config, this call does not pass
# local_files_only=True -- confirm whether that is intentional.
hf_model = transformers.BertModel.from_pretrained(
'bert-base-uncased', config=bert_config, add_pooling_layer=False
)
# Bare expression: the notebook displays the module structure.
hf_model
# Tokenizer matching the pretrained checkpoint (loaded with local_files_only=True).
bert_tokenizer = transformers.BertTokenizer.from_pretrained(
'bert-base-uncased', local_files_only=True
)
# Reference model used for comparison. add_pooling_layer=False builds the
# model without the pooler head, so it ends at the encoder stack.
# NOTE(review): unlike the tokenizer/config, this call does not pass
# local_files_only=True -- confirm whether that is intentional.
hf_model = transformers.BertModel.from_pretrained(
'bert-base-uncased', config=bert_config, add_pooling_layer=False
)
# Bare expression: the notebook displays the module structure.
hf_model
Out[3]:
BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0-11): 12 x BertLayer( (attention): BertAttention( (self): BertSdpaSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) )
接下来,我们根据config和模型的结构,实现一个BERT模型。
In [4]:
Copied!
class BertEmbeddings(torch.nn.Module):
    """BERT input embeddings: word + position + token-type, then LayerNorm and dropout.

    Args:
        config: Configuration object providing ``vocab_size``, ``hidden_size``,
            ``pad_token_id``, ``max_position_embeddings``, ``type_vocab_size``,
            ``layer_norm_eps`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config: transformers.BertConfig):
        super().__init__()
        self.word_embeddings = torch.nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        # Learned (not sinusoidal) position table, one row per position index.
        self.positional_embedding = torch.nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.token_type_embeddings = torch.nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )
        self.ln = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids. Positions are generated
                from ``input_ids.size(1)``, so dimension 1 is treated as the
                sequence dimension.
            token_type_ids: Optional integer tensor of segment ids, same shape
                as ``input_ids``. When omitted, every token is assigned
                segment 0, which is the single-sentence behaviour.

        Returns:
            The summed embeddings after LayerNorm and dropout.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        # Positions 0..seq_len-1, created on the same device as the input;
        # the resulting (seq_len, hidden) table broadcasts over the batch.
        position_ids = torch.arange(input_ids.size(1), device=input_ids.device)
        embeddings = (
            self.word_embeddings(input_ids)
            + self.positional_embedding(position_ids)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.ln(embeddings))
class CustomBertModel(torch.nn.Module):
    """BERT without a pooler: ``BertEmbeddings`` followed by the project ``Encoder``.

    Args:
        config: Configuration whose fields size the embeddings and the
            encoder stack (layer count, hidden size, heads, FFN width,
            dropout, LayerNorm epsilon and activation name).
    """

    def __init__(self, config: transformers.BertConfig):
        super().__init__()
        # Translate the Hugging Face config field names into the keyword
        # arguments expected by the custom Encoder.
        encoder_kwargs = dict(
            num_layers=config.num_hidden_layers,
            input_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            ffn_dim=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            layer_norm_eps=config.layer_norm_eps,
            activation=config.hidden_act,
        )
        self.shared = BertEmbeddings(config)
        self.encoder = Encoder(**encoder_kwargs)

    def forward(self, input_ids, padding_mask):
        """Embed ``input_ids`` and run the result through the encoder.

        Args:
            input_ids: Token ids, passed to the embedding layer.
            padding_mask: Forwarded unchanged to the encoder as
                ``padding_mask``; its exact semantics are defined by
                ``Encoder`` (not visible here).

        Returns:
            Whatever the encoder returns for the embedded input.
        """
        hidden_states = self.shared(input_ids)
        encoded = self.encoder(hidden_states, padding_mask=padding_mask)
        return encoded
# Build the custom model from the loaded config; its parameters are freshly
# initialised at this point (pretrained weights are copied in later).
custom_model = CustomBertModel(bert_config)
# Bare expression: the notebook displays the module structure.
custom_model
class BertEmbeddings(torch.nn.Module):
    """BERT input embeddings: word + position + token-type, then LayerNorm and dropout.

    Args:
        config: Configuration object providing ``vocab_size``, ``hidden_size``,
            ``pad_token_id``, ``max_position_embeddings``, ``type_vocab_size``,
            ``layer_norm_eps`` and ``hidden_dropout_prob``.
    """

    def __init__(self, config: transformers.BertConfig):
        super().__init__()
        self.word_embeddings = torch.nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        # Learned (not sinusoidal) position table, one row per position index.
        self.positional_embedding = torch.nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.token_type_embeddings = torch.nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )
        self.ln = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of token ids. Positions are generated
                from ``input_ids.size(1)``, so dimension 1 is treated as the
                sequence dimension.
            token_type_ids: Optional integer tensor of segment ids, same shape
                as ``input_ids``. When omitted, every token is assigned
                segment 0, which is the single-sentence behaviour.

        Returns:
            The summed embeddings after LayerNorm and dropout.
        """
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)
        # Positions 0..seq_len-1, created on the same device as the input;
        # the resulting (seq_len, hidden) table broadcasts over the batch.
        position_ids = torch.arange(input_ids.size(1), device=input_ids.device)
        embeddings = (
            self.word_embeddings(input_ids)
            + self.positional_embedding(position_ids)
            + self.token_type_embeddings(token_type_ids)
        )
        return self.dropout(self.ln(embeddings))
class CustomBertModel(torch.nn.Module):
    """BERT without a pooler: ``BertEmbeddings`` followed by the project ``Encoder``.

    Args:
        config: Configuration whose fields size the embeddings and the
            encoder stack (layer count, hidden size, heads, FFN width,
            dropout, LayerNorm epsilon and activation name).
    """

    def __init__(self, config: transformers.BertConfig):
        super().__init__()
        # Translate the Hugging Face config field names into the keyword
        # arguments expected by the custom Encoder.
        encoder_kwargs = dict(
            num_layers=config.num_hidden_layers,
            input_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            ffn_dim=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            layer_norm_eps=config.layer_norm_eps,
            activation=config.hidden_act,
        )
        self.shared = BertEmbeddings(config)
        self.encoder = Encoder(**encoder_kwargs)

    def forward(self, input_ids, padding_mask):
        """Embed ``input_ids`` and run the result through the encoder.

        Args:
            input_ids: Token ids, passed to the embedding layer.
            padding_mask: Forwarded unchanged to the encoder as
                ``padding_mask``; its exact semantics are defined by
                ``Encoder`` (not visible here).

        Returns:
            Whatever the encoder returns for the embedded input.
        """
        hidden_states = self.shared(input_ids)
        encoded = self.encoder(hidden_states, padding_mask=padding_mask)
        return encoded
# Build the custom model from the loaded config; its parameters are freshly
# initialised at this point (pretrained weights are copied in later).
custom_model = CustomBertModel(bert_config)
# Bare expression: the notebook displays the module structure.
custom_model
Out[4]:
CustomBertModel( (shared): BertEmbeddings( (word_embeddings): Embedding(30522, 768, padding_idx=0) (positional_embedding): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): Encoder( (layers): ModuleList( (0-11): 12 x EncoderLayer( (attention): MultiHeadSelfAttention( (W_Q): Linear(in_features=768, out_features=768, bias=True) (W_K): Linear(in_features=768, out_features=768, bias=True) (W_V): Linear(in_features=768, out_features=768, bias=True) (W_O): Linear(in_features=768, out_features=768, bias=True) ) (norm1): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (ffn): FFN( (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) (act): GELU(approximate='none') ) (norm2): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout2): Dropout(p=0.1, inplace=False) ) ) ) )
我们需要实现一个函数,将预训练模型的权重加载到我们的模型中。
In [5]:
Copied!
def copy_weights(src, dest, bias: bool = False):
    """Copy the parameters of ``dest`` into ``src``, in place.

    NOTE: the parameter names are historically inverted relative to the data
    flow. ``src`` is the module that gets *overwritten* and ``dest`` is the
    module that is *read*. The names are kept so existing callers keep working.

    Args:
        src: Module whose ``weight`` (and optionally ``bias``) is overwritten.
        dest: Module providing the values to copy.
        bias: When true, the ``bias`` parameter is copied as well.

    Raises:
        ValueError: If a parameter pair has different shapes. ``Tensor.copy_``
            would otherwise broadcast silently and load wrong weights.
    """
    names = ("weight", "bias") if bias else ("weight",)
    # no_grad lets us write into leaf parameters without going through `.data`.
    with torch.no_grad():
        for name in names:
            target = getattr(src, name)
            source = getattr(dest, name)
            if target.shape != source.shape:
                raise ValueError(
                    f"Shape mismatch for '{name}': cannot copy "
                    f"{tuple(source.shape)} into {tuple(target.shape)}"
                )
            target.copy_(source)
def load_weight_from_hf(model: CustomBertModel, hf_model: transformers.BertModel):
    """Copy pretrained parameters from a Hugging Face ``BertModel`` into ``model``.

    The embeddings, every attention/FFN linear layer and every LayerNorm are
    paired by position and copied in place.

    Args:
        model: Custom model to be overwritten.
        hf_model: Pretrained Hugging Face model to read from.

    Returns:
        The same ``model`` object, for call chaining.

    Raises:
        ValueError: If the two models have a different number of encoder
            layers. ``zip`` would otherwise truncate silently and leave the
            extra layers randomly initialised.
    """
    custom_layers = model.encoder.layers
    hf_layers = hf_model.encoder.layer
    if len(custom_layers) != len(hf_layers):
        raise ValueError(
            f"Encoder depth mismatch: custom model has {len(custom_layers)} "
            f"layers, Hugging Face model has {len(hf_layers)}"
        )

    embeddings = model.shared
    hf_embeddings = hf_model.embeddings

    # Pairs are (target, source): copy_weights writes into its first argument.
    # Embedding tables only have a weight.
    weight_only_pairs = [
        (embeddings.word_embeddings, hf_embeddings.word_embeddings),
        (embeddings.positional_embedding, hf_embeddings.position_embeddings),
        (embeddings.token_type_embeddings, hf_embeddings.token_type_embeddings),
    ]
    # Linear and LayerNorm modules have both a weight and a bias.
    weight_and_bias_pairs = [
        (embeddings.ln, hf_embeddings.LayerNorm),
    ]
    for custom_layer, hf_layer in zip(custom_layers, hf_layers):
        weight_and_bias_pairs.extend([
            # Self-attention projections and the post-attention LayerNorm.
            (custom_layer.attention.W_Q, hf_layer.attention.self.query),
            (custom_layer.attention.W_K, hf_layer.attention.self.key),
            (custom_layer.attention.W_V, hf_layer.attention.self.value),
            (custom_layer.attention.W_O, hf_layer.attention.output.dense),
            (custom_layer.norm1, hf_layer.attention.output.LayerNorm),
            # Feed-forward network and its LayerNorm.
            (custom_layer.ffn.fc1, hf_layer.intermediate.dense),
            (custom_layer.ffn.fc2, hf_layer.output.dense),
            (custom_layer.norm2, hf_layer.output.LayerNorm),
        ])

    for target, source in weight_only_pairs:
        copy_weights(target, source)
    for target, source in weight_and_bias_pairs:
        copy_weights(target, source, bias=True)
    return model
def copy_weights(src, dest, bias: bool = False):
    """Copy the parameters of ``dest`` into ``src``, in place.

    NOTE: the parameter names are historically inverted relative to the data
    flow. ``src`` is the module that gets *overwritten* and ``dest`` is the
    module that is *read*. The names are kept so existing callers keep working.

    Args:
        src: Module whose ``weight`` (and optionally ``bias``) is overwritten.
        dest: Module providing the values to copy.
        bias: When true, the ``bias`` parameter is copied as well.

    Raises:
        ValueError: If a parameter pair has different shapes. ``Tensor.copy_``
            would otherwise broadcast silently and load wrong weights.
    """
    names = ("weight", "bias") if bias else ("weight",)
    # no_grad lets us write into leaf parameters without going through `.data`.
    with torch.no_grad():
        for name in names:
            target = getattr(src, name)
            source = getattr(dest, name)
            if target.shape != source.shape:
                raise ValueError(
                    f"Shape mismatch for '{name}': cannot copy "
                    f"{tuple(source.shape)} into {tuple(target.shape)}"
                )
            target.copy_(source)
def load_weight_from_hf(model: CustomBertModel, hf_model: transformers.BertModel):
    """Copy pretrained parameters from a Hugging Face ``BertModel`` into ``model``.

    The embeddings, every attention/FFN linear layer and every LayerNorm are
    paired by position and copied in place.

    Args:
        model: Custom model to be overwritten.
        hf_model: Pretrained Hugging Face model to read from.

    Returns:
        The same ``model`` object, for call chaining.

    Raises:
        ValueError: If the two models have a different number of encoder
            layers. ``zip`` would otherwise truncate silently and leave the
            extra layers randomly initialised.
    """
    custom_layers = model.encoder.layers
    hf_layers = hf_model.encoder.layer
    if len(custom_layers) != len(hf_layers):
        raise ValueError(
            f"Encoder depth mismatch: custom model has {len(custom_layers)} "
            f"layers, Hugging Face model has {len(hf_layers)}"
        )

    embeddings = model.shared
    hf_embeddings = hf_model.embeddings

    # Pairs are (target, source): copy_weights writes into its first argument.
    # Embedding tables only have a weight.
    weight_only_pairs = [
        (embeddings.word_embeddings, hf_embeddings.word_embeddings),
        (embeddings.positional_embedding, hf_embeddings.position_embeddings),
        (embeddings.token_type_embeddings, hf_embeddings.token_type_embeddings),
    ]
    # Linear and LayerNorm modules have both a weight and a bias.
    weight_and_bias_pairs = [
        (embeddings.ln, hf_embeddings.LayerNorm),
    ]
    for custom_layer, hf_layer in zip(custom_layers, hf_layers):
        weight_and_bias_pairs.extend([
            # Self-attention projections and the post-attention LayerNorm.
            (custom_layer.attention.W_Q, hf_layer.attention.self.query),
            (custom_layer.attention.W_K, hf_layer.attention.self.key),
            (custom_layer.attention.W_V, hf_layer.attention.self.value),
            (custom_layer.attention.W_O, hf_layer.attention.output.dense),
            (custom_layer.norm1, hf_layer.attention.output.LayerNorm),
            # Feed-forward network and its LayerNorm.
            (custom_layer.ffn.fc1, hf_layer.intermediate.dense),
            (custom_layer.ffn.fc2, hf_layer.output.dense),
            (custom_layer.norm2, hf_layer.output.LayerNorm),
        ])

    for target, source in weight_only_pairs:
        copy_weights(target, source)
    for target, source in weight_and_bias_pairs:
        copy_weights(target, source, bias=True)
    return model
In [6]:
Copied!
# Overwrite the custom model's parameters with the pretrained Hugging Face
# weights; load_weight_from_hf returns the same model object it was given.
custom_model = load_weight_from_hf(custom_model, hf_model)
# Overwrite the custom model's parameters with the pretrained Hugging Face
# weights; load_weight_from_hf returns the same model object it was given.
custom_model = load_weight_from_hf(custom_model, hf_model)
使用BERT编码一句话,首先需要对句子分词,然后将其输入模型得到对应的输出。
In [7]:
Copied!
# Put both models in evaluation mode so their Dropout layers are inactive and
# the two forward passes are deterministic and comparable.
custom_model.eval()
hf_model.eval()
sentence = "Hello, World!"
# Tokenize into PyTorch tensors ('pt'); the result is indexed below for the
# token ids and the attention mask.
tokenized_sentence = bert_tokenizer(sentence, return_tensors='pt')
input_ids = tokenized_sentence['input_ids']
# The tokenizer's attention mask is reused as the padding mask of the custom model.
padding_mask = tokenized_sentence['attention_mask']
# NOTE(review): the forward passes are not wrapped in torch.no_grad(), so the
# outputs carry autograd history -- consider adding it for pure inference.
custom_output = custom_model(input_ids, padding_mask)
hf_output = hf_model(input_ids, attention_mask=padding_mask).last_hidden_state
# Show the first five hidden values of the first token from each model, side by side.
custom_output[0, 0, :5], hf_output[0, 0, :5]
# Put both models in evaluation mode so their Dropout layers are inactive and
# the two forward passes are deterministic and comparable.
custom_model.eval()
hf_model.eval()
sentence = "Hello, World!"
# Tokenize into PyTorch tensors ('pt'); the result is indexed below for the
# token ids and the attention mask.
tokenized_sentence = bert_tokenizer(sentence, return_tensors='pt')
input_ids = tokenized_sentence['input_ids']
# The tokenizer's attention mask is reused as the padding mask of the custom model.
padding_mask = tokenized_sentence['attention_mask']
# NOTE(review): the forward passes are not wrapped in torch.no_grad(), so the
# outputs carry autograd history -- consider adding it for pure inference.
custom_output = custom_model(input_ids, padding_mask)
hf_output = hf_model(input_ids, attention_mask=padding_mask).last_hidden_state
# Show the first five hidden values of the first token from each model, side by side.
custom_output[0, 0, :5], hf_output[0, 0, :5]
Out[7]:
(tensor([-0.0781, 0.1587, 0.0400, -0.1986, -0.3442], grad_fn=<SliceBackward0>), tensor([-0.0781, 0.1587, 0.0400, -0.1986, -0.3442], grad_fn=<SliceBackward0>))
In [8]:
Copied!
# Verify that the full outputs of both models agree element-wise, using an
# absolute tolerance of 1e-5 on top of allclose's default relative tolerance.
torch.allclose(custom_output, hf_output, atol=1e-5)
# Verify that the full outputs of both models agree element-wise, using an
# absolute tolerance of 1e-5 on top of allclose's default relative tolerance.
torch.allclose(custom_output, hf_output, atol=1e-5)
Out[8]:
True