Spaces:
Running
Running
Merge pull request #15 from borisdayma/feat-fix-lr
Browse files- requirements.txt +3 -0
- seq2seq/run_seq2seq_flax.py +6 -3
- seq2seq/sweep.yaml +3 -3
requirements.txt
CHANGED
@@ -7,3 +7,6 @@ jax[tpu]>=0.2.16
|
|
7 |
-e git+https://github.com/huggingface/datasets.git@master#egg=datasets
|
8 |
flax
|
9 |
jupyter
|
|
|
|
|
|
|
|
7 |
-e git+https://github.com/huggingface/datasets.git@master#egg=datasets
|
8 |
flax
|
9 |
jupyter
|
10 |
+
# for logging
|
11 |
+
tensorboard
|
12 |
+
tensorflow
|
seq2seq/run_seq2seq_flax.py
CHANGED
@@ -19,8 +19,11 @@ Script adapted from run_summarization_flax.py
|
|
19 |
"""
|
20 |
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
21 |
|
22 |
-
import logging as pylogging # To avoid collision with transformers.utils.logging
|
23 |
import os
|
|
|
|
|
|
|
|
|
24 |
import sys
|
25 |
import time
|
26 |
from dataclasses import dataclass, field
|
@@ -673,12 +676,12 @@ def main():
|
|
673 |
grads = jax.tree_map(lambda x: x / training_args.gradient_accumulation_steps, grad_accum)
|
674 |
grads = jax.lax.pmean(grads, "batch")
|
675 |
new_state = state.apply_gradients(
|
676 |
-
grads=grads, grad_accum=jax.tree_map(jnp.zeros_like, grads), optimizer_step=state.optimizer_step
|
677 |
)
|
678 |
return new_state
|
679 |
|
680 |
new_state = jax.lax.cond(
|
681 |
-
state.step % training_args.gradient_accumulation_steps == 0,
|
682 |
lambda _: update_fn(),
|
683 |
lambda _: state.replace(grad_accum=grad_accum, step=state.step + 1),
|
684 |
None,
|
|
|
19 |
"""
|
20 |
# You can also adapt this script on your own sequence to sequence task. Pointers for this are left as comments.
|
21 |
|
|
|
22 |
import os
|
23 |
+
# set a common huggingface cache folder (used with datasets and transformers)
|
24 |
+
os.environ['HF_HOME'] = '/data/huggingface/' # required before importing transformers & datasets
|
25 |
+
|
26 |
+
import logging as pylogging # To avoid collision with transformers.utils.logging
|
27 |
import sys
|
28 |
import time
|
29 |
from dataclasses import dataclass, field
|
|
|
676 |
grads = jax.tree_map(lambda x: x / training_args.gradient_accumulation_steps, grad_accum)
|
677 |
grads = jax.lax.pmean(grads, "batch")
|
678 |
new_state = state.apply_gradients(
|
679 |
+
grads=grads, grad_accum=jax.tree_map(jnp.zeros_like, grads), optimizer_step=state.optimizer_step + 1
|
680 |
)
|
681 |
return new_state
|
682 |
|
683 |
new_state = jax.lax.cond(
|
684 |
+
(state.step + 1) % training_args.gradient_accumulation_steps == 0,
|
685 |
lambda _: update_fn(),
|
686 |
lambda _: state.replace(grad_accum=grad_accum, step=state.step + 1),
|
687 |
None,
|
seq2seq/sweep.yaml
CHANGED
@@ -8,9 +8,9 @@ metric:
|
|
8 |
parameters:
|
9 |
learning_rate:
|
10 |
distribution: log_uniform
|
11 |
-
# from exp(min) to exp(max), ie 1e-
|
12 |
-
min: -
|
13 |
-
max: -
|
14 |
gradient_accumulation_steps:
|
15 |
value: 8
|
16 |
warmup_steps:
|
|
|
8 |
parameters:
|
9 |
learning_rate:
|
10 |
distribution: log_uniform
|
11 |
+
# from exp(min) to exp(max), ie 1e-4 to 5e-3 on log scale
|
12 |
+
min: -9.2
|
13 |
+
max: -5.3
|
14 |
gradient_accumulation_steps:
|
15 |
value: 8
|
16 |
warmup_steps:
|