Implementing BERT
In [1]:
import transformers
import torch
from custom_transformer import Encoder
A model on Hugging Face usually comes with a config object that stores the model's hyperparameters.
In [2]:
bert_config = transformers.BertConfig.from_pretrained('bert-base-uncased', local_files_only=True)
bert_config
Out[2]:
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.41.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
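The individual hyperparameters are exposed as attributes on the config object; for example (a quick sketch using the config loaded above):

bert_config.hidden_size          # 768
bert_config.num_hidden_layers    # 12
bert_config.num_attention_heads  # 12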
First, we load the pretrained bert-base-uncased model directly, so that we can compare it against our own implementation.
In [3]:
bert_tokenizer = transformers.BertTokenizer.from_pretrained(
    'bert-base-uncased', local_files_only=True
)
# add_pooling_layer=False removes the pooler layer, which we do not reimplement
hf_model = transformers.BertModel.from_pretrained(
    'bert-base-uncased', config=bert_config, add_pooling_layer=False
)
hf_model
Out[3]:
BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=3072, out_features=768, bias=True)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
)
Next, we implement a BERT model of our own, following the config and the structure shown above.
In [4]:
class BertEmbeddings(torch.nn.Module):
    def __init__(self, config: transformers.BertConfig):
        super(BertEmbeddings, self).__init__()
        self.word_embeddings = torch.nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
        )
        self.positional_embedding = torch.nn.Embedding(
            config.max_position_embeddings, config.hidden_size
        )
        self.token_type_embeddings = torch.nn.Embedding(
            config.type_vocab_size, config.hidden_size
        )
        self.ln = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids):
        input_embeds = self.word_embeddings(input_ids)
        # Absolute position embeddings: (seq_len, hidden) broadcast over the batch dimension
        pos_embeds = self.positional_embedding(
            torch.arange(input_ids.size(1), device=input_ids.device)
        )
        # Single-segment inputs only, so every token gets token type 0
        type_embeds = self.token_type_embeddings(
            torch.zeros_like(input_ids)
        )
        return self.dropout(self.ln(input_embeds + pos_embeds + type_embeds))


class CustomBertModel(torch.nn.Module):
    def __init__(self, config: transformers.BertConfig):
        super(CustomBertModel, self).__init__()
        self.shared = BertEmbeddings(config)
        self.encoder = Encoder(
            num_layers=config.num_hidden_layers,
            input_dim=config.hidden_size,
            num_heads=config.num_attention_heads,
            ffn_dim=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            layer_norm_eps=config.layer_norm_eps,
            activation=config.hidden_act,
        )

    def forward(self, input_ids, padding_mask):
        input_embeds = self.shared(input_ids)
        return self.encoder(input_embeds, padding_mask=padding_mask)


custom_model = CustomBertModel(bert_config)
custom_model
Out[4]:
CustomBertModel(
  (shared): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (positional_embedding): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-11): 12 x EncoderLayer(
        (attention): MultiHeadSelfAttention(
          (W_Q): Linear(in_features=768, out_features=768, bias=True)
          (W_K): Linear(in_features=768, out_features=768, bias=True)
          (W_V): Linear(in_features=768, out_features=768, bias=True)
          (W_O): Linear(in_features=768, out_features=768, bias=True)
        )
        (norm1): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (ffn): FFN(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (act): GELU(approximate='none')
        )
        (norm2): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
)
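Before copying any weights, a quick sanity check (a sketch, not an executed cell) is to compare the number of parameters in the two models; if the structures line up, the counts should match:

n_custom = sum(p.numel() for p in custom_model.parameters())
n_hf = sum(p.numel() for p in hf_model.parameters())
print(n_custom == n_hf)  # expected to be True when the architectures match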
We need a function that loads the pretrained weights into our model.
In [5]:
def copy_weights(dst, src, bias: bool = False):
    # Copy the weight (and optionally the bias) of `src` into `dst` in place
    dst.weight.data.copy_(src.weight.data)
    if bias:
        dst.bias.data.copy_(src.bias.data)


def load_weight_from_hf(model: CustomBertModel, hf_model: transformers.BertModel):
    # Embeddings (Embedding layers have no bias; LayerNorm does)
    unbiased_layer_pairs = [
        (model.shared.word_embeddings, hf_model.embeddings.word_embeddings),
        (model.shared.positional_embedding, hf_model.embeddings.position_embeddings),
        (model.shared.token_type_embeddings, hf_model.embeddings.token_type_embeddings),
    ]
    biased_layer_pairs = [
        (model.shared.ln, hf_model.embeddings.LayerNorm),
    ]
    for custom_layer, hf_layer in zip(model.encoder.layers, hf_model.encoder.layer):
        # Attention
        biased_layer_pairs.append((custom_layer.attention.W_Q, hf_layer.attention.self.query))
        biased_layer_pairs.append((custom_layer.attention.W_K, hf_layer.attention.self.key))
        biased_layer_pairs.append((custom_layer.attention.W_V, hf_layer.attention.self.value))
        biased_layer_pairs.append((custom_layer.attention.W_O, hf_layer.attention.output.dense))
        biased_layer_pairs.append((custom_layer.norm1, hf_layer.attention.output.LayerNorm))
        # FFN
        biased_layer_pairs.append((custom_layer.ffn.fc1, hf_layer.intermediate.dense))
        biased_layer_pairs.append((custom_layer.ffn.fc2, hf_layer.output.dense))
        biased_layer_pairs.append((custom_layer.norm2, hf_layer.output.LayerNorm))
    for dst, src in unbiased_layer_pairs:
        copy_weights(dst, src)
    for dst, src in biased_layer_pairs:
        copy_weights(dst, src, bias=True)
    return model
In [6]:
custom_model = load_weight_from_hf(custom_model, hf_model)
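As a sanity check (a minimal sketch using the attribute names defined above), we can confirm that a few of the copied tensors now match exactly:

assert torch.equal(custom_model.shared.word_embeddings.weight,
                   hf_model.embeddings.word_embeddings.weight)
assert torch.equal(custom_model.encoder.layers[0].attention.W_Q.weight,
                   hf_model.encoder.layer[0].attention.self.query.weight)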
To encode a sentence with BERT, we first tokenize it and then feed the token ids into the model to obtain the corresponding output.
In [7]:
custom_model.eval()
hf_model.eval()
sentence = "Hello, World!"
tokenized_sentence = bert_tokenizer(sentence, return_tensors='pt')
input_ids = tokenized_sentence['input_ids']
padding_mask = tokenized_sentence['attention_mask']
custom_output = custom_model(input_ids, padding_mask)
hf_output = hf_model(input_ids, attention_mask=padding_mask).last_hidden_state
custom_output[0, 0, :5], hf_output[0, 0, :5]
Out[7]:
(tensor([-0.0781, 0.1587, 0.0400, -0.1986, -0.3442], grad_fn=<SliceBackward0>), tensor([-0.0781, 0.1587, 0.0400, -0.1986, -0.3442], grad_fn=<SliceBackward0>))
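The same interface works for batched inputs; here is a sketch (not an executed cell) that assumes the custom padding_mask follows the Hugging Face attention_mask convention of 1 for real tokens and 0 for padding:

batch = bert_tokenizer(["Hello, World!", "A somewhat longer example sentence."],
                       padding=True, return_tensors='pt')
batch_output = custom_model(batch['input_ids'], batch['attention_mask'])
batch_output.shape  # expected (2, seq_len, 768)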
In [8]:
torch.allclose(custom_output, hf_output, atol=1e-5)
Out[8]:
True
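Since the two models now agree, the custom model can be used in place of the Hugging Face one. For instance (a sketch, not an executed cell), a simple sentence representation can be taken from the [CLS] position of the output:

with torch.no_grad():
    cls_embedding = custom_model(input_ids, padding_mask)[:, 0, :]  # shape (1, 768)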