Commit a59f066
Parent(s): 7756256

Upload folder using huggingface_hub

Changed files:
- attention.py (+1 -1)
- config.json (+3 -3)
- modeling_mpt.py (+6 -1)
- pytorch_model-00001-of-00007.bin (+1 -1)
- pytorch_model-00002-of-00007.bin (+1 -1)
- pytorch_model-00003-of-00007.bin (+1 -1)
- pytorch_model-00004-of-00007.bin (+1 -1)
- pytorch_model-00005-of-00007.bin (+1 -1)
- pytorch_model-00006-of-00007.bin (+1 -1)
- pytorch_model-00007-of-00007.bin (+1 -1)
- tokenizer.json (+8 -1)
- tokenizer_config.json (+1 -1)
attention.py
CHANGED
@@ -55,7 +55,7 @@ def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_
     attn_weight = torch.softmax(attn_weight, dim=-1)
     if dropout_p:
         attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
-    out = attn_weight.matmul(v)
+    out = attn_weight.to(v.dtype).matmul(v)
     out = rearrange(out, 'b h s d -> b s (h d)')
     if needs_weights:
         return (out, attn_weight, past_key_value)
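The attention.py change casts the softmax output back to the value tensor's dtype before the final matmul. A minimal sketch of the failure it avoids, with hypothetical shapes and dtypes (not taken from the repo):

import torch

# Hypothetical sizes; the point is the dtype mismatch, not the shapes.
b, h, s, d = 1, 2, 4, 8
attn_weight = torch.softmax(torch.randn(b, h, s, s, dtype=torch.float32), dim=-1)
v = torch.randn(b, h, s, d, dtype=torch.float16)

# attn_weight.matmul(v) raises a dtype-mismatch RuntimeError when the weights
# are float32 but v is half precision; casting first keeps the matmul in v's dtype.
out = attn_weight.to(v.dtype).matmul(v)
print(out.dtype)  # torch.float16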
config.json
CHANGED
@@ -8,8 +8,8 @@
     "attn_impl": "torch",
     "attn_pdrop": 0,
     "attn_type": "multihead_attention",
-    "attn_uses_sequence_id":
-    "clip_qkv":
+    "attn_uses_sequence_id": false,
+    "clip_qkv": null,
     "prefix_lm": false,
     "qk_ln": false,
     "softmax_scale": null
@@ -36,7 +36,7 @@
   "init_device": "cpu",
   "learned_pos_emb": true,
   "logit_scale": null,
-  "max_seq_len":
+  "max_seq_len": 8192,
   "model_type": "mpt",
   "n_heads": 64,
   "n_layers": 48,
modeling_mpt.py
CHANGED
@@ -40,6 +40,11 @@ class MPTModel(MPTPreTrainedModel):
         self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
         self.alibi = config.attn_config['alibi']
         self.alibi_bias_max = config.attn_config['alibi_bias_max']
+        if config.init_device == 'mixed':
+            if dist.get_local_rank() == 0:
+                config.init_device = 'cpu'
+            else:
+                config.init_device = 'meta'
         if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
             norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
             raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
@@ -182,7 +187,7 @@ class MPTModel(MPTPreTrainedModel):
         x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
         assert isinstance(self.emb_drop, nn.Module)
         x = self.emb_drop(x_shrunk)
-        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=
+        (attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
         if use_cache and past_key_values is None:
             past_key_values = [() for _ in range(self.config.n_layers)]
         all_hidden_states = () if output_hidden_states else None
pytorch_model-00001-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:bb8229f99d31f14643324ae69c945bfc5c2548dee813ef23b7260b8d22ff3d82
 size 9766157965

pytorch_model-00002-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f4612b6e2b09839df9f3babfb1d00a49ff6eef3a8e6126a1e37732f59b9539c7
 size 9865248775

pytorch_model-00003-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:abf88070285848375c85b1fc5328b8b10485286b7c453a7a3e66e36f1be19aa8
 size 9865248775

pytorch_model-00004-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:028048cf61661c4c9173918053dad5aab752767f63d3e11133aee4f2c0431fbd
 size 9865248775

pytorch_model-00005-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:5990a41d6a5a2418977a75622b00106043b68786cf85f221361b3711522290e8
 size 9865248775

pytorch_model-00006-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:c7f090b3e9c31fef6b85569d537c86f1648bc61631e7d1fa83d498124bb7e752
 size 9865248775

pytorch_model-00007-of-00007.bin
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:d1ab59fada51f526990510c5601e82d4f606214cfad59ebf5a333f6d12d63f1f
 size 822099468
tokenizer.json
CHANGED
@@ -1,7 +1,14 @@
 {
   "version": "1.0",
   "truncation": null,
-  "padding":
+  "padding": {
+    "strategy": "BatchLongest",
+    "direction": "Left",
+    "pad_to_multiple_of": null,
+    "pad_id": 0,
+    "pad_type_id": 0,
+    "pad_token": "<|endoftext|>"
+  },
   "added_tokens": [
     {
       "id": 0,
tokenizer_config.json
CHANGED
@@ -3,7 +3,7 @@
   "bos_token": "<|endoftext|>",
   "clean_up_tokenization_spaces": true,
   "eos_token": "<|endoftext|>",
-  "model_max_length":
+  "model_max_length": 8192,
   "tokenizer_class": "GPTNeoXTokenizer",
   "unk_token": "<|endoftext|>"
 }
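The model_max_length change keeps the tokenizer's limit in line with config.json's max_seq_len of 8192. A brief hedged sketch (the repo id is again an assumption) of where the value shows up:

from transformers import AutoTokenizer

# Hypothetical repo id; substitute the actual model repository.
tok = AutoTokenizer.from_pretrained('mosaicml/mpt-30b-chat')
print(tok.model_max_length)  # 8192

# With truncation enabled and no explicit max_length, inputs are cut to
# model_max_length tokens.
ids = tok('lorem ipsum ' * 10000, truncation=True)['input_ids']
print(len(ids) <= 8192)  # True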