Commit a59f066
Parent(s): 7756256

Upload folder using huggingface_hub

Changed files:
- attention.py (+1 -1)
- config.json (+3 -3)
- modeling_mpt.py (+6 -1)
- pytorch_model-00001-of-00007.bin (+1 -1)
- pytorch_model-00002-of-00007.bin (+1 -1)
- pytorch_model-00003-of-00007.bin (+1 -1)
- pytorch_model-00004-of-00007.bin (+1 -1)
- pytorch_model-00005-of-00007.bin (+1 -1)
- pytorch_model-00006-of-00007.bin (+1 -1)
- pytorch_model-00007-of-00007.bin (+1 -1)
- tokenizer.json (+8 -1)
- tokenizer_config.json (+1 -1)
attention.py
CHANGED
@@ -55,7 +55,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
     attn_weight = torch.softmax(attn_weight, dim=-1)
     if dropout_p:
         attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
-    out = attn_weight.matmul(v)
+    out = attn_weight.to(v.dtype).matmul(v)
     out = rearrange(out, 'b h s d -> b s (h d)')
     if needs_weights:
         return (out, attn_weight, past_key_value)
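The attention.py change casts the softmax output back to the value tensor's dtype before the final matmul. A minimal sketch of the failure it avoids, with hypothetical shapes and dtypes (not taken from the repo):

import torch

# Hypothetical sizes; the point is the dtype mismatch, not the shapes.
b, h, s, d = 1, 2, 4, 8
attn_weight = torch.softmax(torch.randn(b, h, s, s, dtype=torch.float32), dim=-1)
v = torch.randn(b, h, s, d, dtype=torch.float16)

# attn_weight.matmul(v) raises a dtype-mismatch RuntimeError when the weights
# are float32 but v is half precision; casting first keeps the matmul in v's dtype.
out = attn_weight.to(v.dtype).matmul(v)
print(out.dtype)  # torch.float16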
config.json
CHANGED
@@ -8,8 +8,8 @@
     "attn_impl": "torch",
     "attn_pdrop": 0,
     "attn_type": "multihead_attention",
-    "attn_uses_sequence_id":
-    "clip_qkv":
+    "attn_uses_sequence_id": false,
+    "clip_qkv": null,
     "prefix_lm": false,
     "qk_ln": false,
     "softmax_scale": null
@@ -36,7 +36,7 @@
   "init_device": "cpu",
   "learned_pos_emb": true,
   "logit_scale": null,
-  "max_seq_len":
+  "max_seq_len": 8192,
   "model_type": "mpt",
   "n_heads": 64,
   "n_layers": 48,
modeling_mpt.py
CHANGED
@@ -40,6 +40,11 @@ class MPTModel(MPTPreTrainedModel):
         self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
         self.alibi = config.attn_config['alibi']
         self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        if config.init_device == 'mixed':
+            if dist.get_local_rank() == 0:
+                config.init_device = 'cpu'
+            else:
+                config.init_device = 'meta'
         if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
             norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
             raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
@@ -182,7 +187,7 @@ class MPTModel(MPTPreTrainedModel):
         x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
         assert isinstance(self.emb_drop, nn.Module)
         x = self.emb_drop(x_shrunk)
-        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=
+        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
         if use_cache and past_key_values is None:
             past_key_values = [() for _ in range(self.config.n_layers)]
         all_hidden_states = () if output_hidden_states else None
pytorch_model-00001-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bb8229f99d31f14643324ae69c945bfc5c2548dee813ef23b7260b8d22ff3d82
 size 9766157965

pytorch_model-00002-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f4612b6e2b09839df9f3babfb1d00a49ff6eef3a8e6126a1e37732f59b9539c7
 size 9865248775

pytorch_model-00003-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:abf88070285848375c85b1fc5328b8b10485286b7c453a7a3e66e36f1be19aa8
 size 9865248775

pytorch_model-00004-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:028048cf61661c4c9173918053dad5aab752767f63d3e11133aee4f2c0431fbd
 size 9865248775

pytorch_model-00005-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5990a41d6a5a2418977a75622b00106043b68786cf85f221361b3711522290e8
 size 9865248775

pytorch_model-00006-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c7f090b3e9c31fef6b85569d537c86f1648bc61631e7d1fa83d498124bb7e752
 size 9865248775

pytorch_model-00007-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d1ab59fada51f526990510c5601e82d4f606214cfad59ebf5a333f6d12d63f1f
 size 822099468
tokenizer.json
CHANGED
@@ -1,7 +1,14 @@
 {
   "version": "1.0",
   "truncation": null,
-  "padding":
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Left",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "<|endoftext|>"
+  },
   "added_tokens": [
     {
       "id": 0,
tokenizer_config.json
CHANGED
@@ -3,7 +3,7 @@
   "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
-  "model_max_length":
+  "model_max_length": 8192,
   "tokenizer_class": "GPTNeoXTokenizer",
   "unk_token": "<|endoftext|>"
 }
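The model_max_length change keeps the tokenizer's limit in line with config.json's max_seq_len of 8192. A brief hedged sketch (the repo id is again an assumption) of where the value shows up:

from transformers import AutoTokenizer

# Hypothetical repo id; substitute the actual model repository.
tok = AutoTokenizer.from_pretrained('mosaicml/mpt-30b-chat')
print(tok.model_max_length)  # 8192

# With truncation enabled and no explicit max_length, inputs are cut to
# model_max_length tokens.
ids = tok('lorem ipsum ' * 10000, truncation=True)['input_ids']
print(len(ids) <= 8192)  # True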