Spaces:

flax-community
/

dalle-mini

Running

valhalla committed on Oct 19, 2021

Commit

f6c4cb2

1 Parent(s): 29db327

make checkpointing optional

Files changed (1) hide show

dalle_mini/modeling_bart_flax.py CHANGED Viewed

@@ -252,8 +252,7 @@ class FlaxBartEncoderLayer(nn.Module):
             kernel_init=jax.nn.initializers.normal(self.config.init_std),
         )
         self.final_layer_norm = nn.LayerNorm(dtype=self.dtype)
-    @nn.remat
     def __call__(
         self,
         hidden_states: jnp.ndarray,
@@ -283,8 +282,9 @@ class FlaxBartEncoderLayerCollection(nn.Module):
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
         self.layers = [
-            FlaxBartEncoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.encoder_layers)
         ]
     def __call__(
@@ -344,8 +344,7 @@ class FlaxBartDecoderLayer(nn.Module):
             kernel_init=jax.nn.initializers.normal(self.config.init_std),
         )
         self.final_layer_norm = nn.LayerNorm(dtype=self.dtype)
-    @nn.remat
     def __call__(
         self,
         hidden_states: jnp.ndarray,
@@ -394,8 +393,9 @@ class FlaxBartDecoderLayerCollection(nn.Module):
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
         self.layers = [
-            FlaxBartDecoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.decoder_layers)
         ]
     def __call__(

             kernel_init=jax.nn.initializers.normal(self.config.init_std),
         )
         self.final_layer_norm = nn.LayerNorm(dtype=self.dtype)
     def __call__(
         self,
         hidden_states: jnp.ndarray,
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
+        layer_module = nn.remat(FlaxBartEncoderLayer) if self.config.gradient_checkpointing else FlaxBartEncoderLayer
         self.layers = [
+            layer_module(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.encoder_layers)
         ]
     def __call__(
             kernel_init=jax.nn.initializers.normal(self.config.init_std),
         )
         self.final_layer_norm = nn.LayerNorm(dtype=self.dtype)
     def __call__(
         self,
         hidden_states: jnp.ndarray,
     dtype: jnp.dtype = jnp.float32  # the dtype of the computation
     def setup(self):
+        layer_module = nn.remat(FlaxBartDecoderLayer) if self.config.gradient_checkpointing else FlaxBartDecoderLayer
         self.layers = [
+            layer_module(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.decoder_layers)
         ]
     def __call__(