lots of minor updates
Browse files- run.sh → run_cc.sh +12 -9
- run_npsc.sh +9 -7
- run_nst.sh +9 -6
- run_whisper_finetuning.py +23 -28
- run_xla.sh +43 -0
run.sh → run_cc.sh
RENAMED
|
@@ -1,7 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python run_whisper_finetuning.py \
|
| 3 |
--model_name_or_path="openai/whisper-small" \
|
| 4 |
-
--output_dir="../whisper-
|
| 5 |
--overwrite_output_dir=True \
|
| 6 |
--language="Norwegian" \
|
| 7 |
--task="transcribe" \
|
|
@@ -11,18 +15,18 @@ python run_whisper_finetuning.py \
|
|
| 11 |
--do_eval=True \
|
| 12 |
--audio_column_name="audio" \
|
| 13 |
--text_column_name="sentence" \
|
| 14 |
-
--per_device_train_batch_size=
|
| 15 |
-
--per_device_train_batch_size=
|
| 16 |
-
--learning_rate=
|
| 17 |
--warmup_steps=500 \
|
| 18 |
-
--max_steps=
|
| 19 |
--gradient_checkpointing=True \
|
| 20 |
--gradient_accumulation_steps=1 \
|
| 21 |
--group_by_length=False \
|
| 22 |
--evaluation_strategy="steps" \
|
| 23 |
-
--save_steps=
|
| 24 |
-
--eval_steps=
|
| 25 |
-
--max_eval_samples=
|
| 26 |
--logging_steps=250 \
|
| 27 |
--fp16=True \
|
| 28 |
--load_best_model_at_end=True \
|
|
@@ -34,5 +38,4 @@ python run_whisper_finetuning.py \
|
|
| 34 |
--print_training_arguments=True \
|
| 35 |
--push_to_hub=True
|
| 36 |
|
| 37 |
-
|
| 38 |
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the very small Nynorsk Common Voice
|
| 2 |
+
# Test script. Currently this only runs if you uncomment the lines in the training code for casting this from 48K to 16K
|
| 3 |
+
# Currently for training on a 48GB GPU
|
| 4 |
+
# Reduce batch size and learning rate if training on smaller GPU
|
| 5 |
|
| 6 |
python run_whisper_finetuning.py \
|
| 7 |
--model_name_or_path="openai/whisper-small" \
|
| 8 |
+
--output_dir="../whisper-CC-nn" \
|
| 9 |
--overwrite_output_dir=True \
|
| 10 |
--language="Norwegian" \
|
| 11 |
--task="transcribe" \
|
|
|
|
| 15 |
--do_eval=True \
|
| 16 |
--audio_column_name="audio" \
|
| 17 |
--text_column_name="sentence" \
|
| 18 |
+
--per_device_train_batch_size=48 \
|
| 19 |
+
--per_device_eval_batch_size=48 \
|
| 20 |
+
--learning_rate=4e-5 \
|
| 21 |
--warmup_steps=500 \
|
| 22 |
+
--max_steps=1000 \
|
| 23 |
--gradient_checkpointing=True \
|
| 24 |
--gradient_accumulation_steps=1 \
|
| 25 |
--group_by_length=False \
|
| 26 |
--evaluation_strategy="steps" \
|
| 27 |
+
--save_steps=250 \
|
| 28 |
+
--eval_steps=250 \
|
| 29 |
+
--max_eval_samples=50 \
|
| 30 |
--logging_steps=250 \
|
| 31 |
--fp16=True \
|
| 32 |
--load_best_model_at_end=True \
|
|
|
|
| 38 |
--print_training_arguments=True \
|
| 39 |
--push_to_hub=True
|
| 40 |
|
|
|
|
| 41 |
|
run_npsc.sh
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python run_whisper_finetuning.py \
|
| 3 |
--model_name_or_path="openai/whisper-small" \
|
| 4 |
-
--output_dir="../whisper-
|
| 5 |
--overwrite_output_dir=True \
|
| 6 |
--language="Norwegian" \
|
| 7 |
--task="transcribe" \
|
|
@@ -10,18 +13,19 @@ python run_whisper_finetuning.py \
|
|
| 10 |
--do_train=True \
|
| 11 |
--do_eval=True \
|
| 12 |
--audio_column_name="audio" \
|
| 13 |
-
--text_column_name="
|
| 14 |
--per_device_train_batch_size=16 \
|
| 15 |
-
|
| 16 |
-
--learning_rate=
|
| 17 |
--warmup_steps=500 \
|
| 18 |
-
--max_steps=
|
| 19 |
--gradient_checkpointing=True \
|
| 20 |
--gradient_accumulation_steps=1 \
|
| 21 |
--group_by_length=False \
|
| 22 |
--evaluation_strategy="steps" \
|
| 23 |
--save_steps=1000 \
|
| 24 |
--eval_steps=1000 \
|
|
|
|
| 25 |
--logging_steps=250 \
|
| 26 |
--fp16=True \
|
| 27 |
--load_best_model_at_end=True \
|
|
@@ -33,5 +37,3 @@ python run_whisper_finetuning.py \
|
|
| 33 |
--print_training_arguments=True \
|
| 34 |
--push_to_hub=True
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the NPSC_orto dataset
|
| 2 |
+
# Currently for training on a 48GB GPU
|
| 3 |
+
# Reduce batch size and learning rate if training on smaller GPU
|
| 4 |
|
| 5 |
python run_whisper_finetuning.py \
|
| 6 |
--model_name_or_path="openai/whisper-small" \
|
| 7 |
+
--output_dir="../whisper-NPSC" \
|
| 8 |
--overwrite_output_dir=True \
|
| 9 |
--language="Norwegian" \
|
| 10 |
--task="transcribe" \
|
|
|
|
| 13 |
--do_train=True \
|
| 14 |
--do_eval=True \
|
| 15 |
--audio_column_name="audio" \
|
| 16 |
+
--text_column_name="sentence_nob" \
|
| 17 |
--per_device_train_batch_size=16 \
|
| 18 |
+
--per_device_eval_batch_size=16 \
|
| 19 |
+
--learning_rate=4e-5 \
|
| 20 |
--warmup_steps=500 \
|
| 21 |
+
--max_steps=5000 \
|
| 22 |
--gradient_checkpointing=True \
|
| 23 |
--gradient_accumulation_steps=1 \
|
| 24 |
--group_by_length=False \
|
| 25 |
--evaluation_strategy="steps" \
|
| 26 |
--save_steps=1000 \
|
| 27 |
--eval_steps=1000 \
|
| 28 |
+
--max_eval_samples=100 \
|
| 29 |
--logging_steps=250 \
|
| 30 |
--fp16=True \
|
| 31 |
--load_best_model_at_end=True \
|
|
|
|
| 37 |
--print_training_arguments=True \
|
| 38 |
--push_to_hub=True
|
| 39 |
|
|
|
|
|
|
run_nst.sh
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python run_whisper_finetuning.py \
|
| 3 |
--model_name_or_path="openai/whisper-small" \
|
| 4 |
-
--output_dir="../whisper-
|
| 5 |
--overwrite_output_dir=True \
|
| 6 |
--language="Norwegian" \
|
| 7 |
--task="transcribe" \
|
|
@@ -11,17 +14,17 @@ python run_whisper_finetuning.py \
|
|
| 11 |
--do_eval=True \
|
| 12 |
--audio_column_name="audio" \
|
| 13 |
--text_column_name="text" \
|
| 14 |
-
--per_device_train_batch_size=
|
| 15 |
-
|
| 16 |
-
--learning_rate=
|
| 17 |
--warmup_steps=500 \
|
| 18 |
-
--max_steps=
|
| 19 |
--gradient_checkpointing=True \
|
| 20 |
--gradient_accumulation_steps=1 \
|
| 21 |
--group_by_length=False \
|
| 22 |
--evaluation_strategy="steps" \
|
| 23 |
--save_steps=1000 \
|
| 24 |
-
--eval_steps=
|
| 25 |
--max_eval_samples=100 \
|
| 26 |
--logging_steps=250 \
|
| 27 |
--fp16=True \
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the NST dataset
|
| 2 |
+
# Currently for training on a 48GB GPU
|
| 3 |
+
# Reduce batch size and learning rate if training on smaller GPU
|
| 4 |
|
| 5 |
python run_whisper_finetuning.py \
|
| 6 |
--model_name_or_path="openai/whisper-small" \
|
| 7 |
+
--output_dir="../whisper-NST" \
|
| 8 |
--overwrite_output_dir=True \
|
| 9 |
--language="Norwegian" \
|
| 10 |
--task="transcribe" \
|
|
|
|
| 14 |
--do_eval=True \
|
| 15 |
--audio_column_name="audio" \
|
| 16 |
--text_column_name="text" \
|
| 17 |
+
--per_device_train_batch_size=48 \
|
| 18 |
+
--per_device_eval_batch_size=48 \
|
| 19 |
+
--learning_rate=4e-5 \
|
| 20 |
--warmup_steps=500 \
|
| 21 |
+
--max_steps=5000 \
|
| 22 |
--gradient_checkpointing=True \
|
| 23 |
--gradient_accumulation_steps=1 \
|
| 24 |
--group_by_length=False \
|
| 25 |
--evaluation_strategy="steps" \
|
| 26 |
--save_steps=1000 \
|
| 27 |
+
--eval_steps=1000 \
|
| 28 |
--max_eval_samples=100 \
|
| 29 |
--logging_steps=250 \
|
| 30 |
--fp16=True \
|
run_whisper_finetuning.py
CHANGED
|
@@ -234,7 +234,7 @@ class DataTrainingArguments:
|
|
| 234 |
default=None,
|
| 235 |
metadata={
|
| 236 |
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
|
| 237 |
-
"value if set.
|
| 238 |
},
|
| 239 |
)
|
| 240 |
chars_to_ignore: Optional[List[str]] = list_field(
|
|
@@ -410,18 +410,22 @@ def main():
|
|
| 410 |
|
| 411 |
|
| 412 |
# Prepare data
|
| 413 |
-
#
|
|
|
|
|
|
|
| 414 |
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 415 |
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 416 |
|
| 417 |
-
|
|
|
|
|
|
|
| 418 |
# train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
|
| 419 |
|
| 420 |
train_dataset = train_dataset.map(prepare_dataset)
|
| 421 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
| 422 |
|
| 423 |
# Metrics
|
| 424 |
-
metric = evaluate.load("wer")
|
| 425 |
|
| 426 |
# Detecting last checkpoint.
|
| 427 |
last_checkpoint = None
|
|
@@ -441,7 +445,9 @@ def main():
|
|
| 441 |
|
| 442 |
# Training
|
| 443 |
if training_args.do_train:
|
| 444 |
-
|
|
|
|
|
|
|
| 445 |
# use last checkpoint if exist
|
| 446 |
if last_checkpoint is not None:
|
| 447 |
print("*** Found a checkpoint!")
|
|
@@ -464,21 +470,26 @@ def main():
|
|
| 464 |
set_seed(training_args.seed)
|
| 465 |
|
| 466 |
# TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
|
| 467 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
-
#
|
| 470 |
processor.save_pretrained(training_args.output_dir)
|
| 471 |
|
|
|
|
|
|
|
| 472 |
trainer = Seq2SeqTrainer(
|
| 473 |
args=training_args,
|
| 474 |
model=model,
|
| 475 |
train_dataset=train_dataset.with_format("torch"),
|
| 476 |
-
eval_dataset=eval_dataset.with_format("torch"),
|
| 477 |
data_collator=data_collator,
|
| 478 |
compute_metrics=compute_metrics,
|
| 479 |
tokenizer=processor.feature_extractor,
|
| 480 |
)
|
| 481 |
-
|
| 482 |
|
| 483 |
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
| 484 |
trainer.save_model()
|
|
@@ -486,6 +497,8 @@ def main():
|
|
| 486 |
metrics = train_result.metrics
|
| 487 |
trainer.log_metrics("train", metrics)
|
| 488 |
trainer.save_metrics("train", metrics)
|
|
|
|
|
|
|
| 489 |
trainer.save_state()
|
| 490 |
|
| 491 |
if training_args.push_to_hub:
|
|
@@ -493,24 +506,7 @@ def main():
|
|
| 493 |
else:
|
| 494 |
trainer.create_model_card(**kwargs)
|
| 495 |
|
| 496 |
-
# TODO - Look closer into the
|
| 497 |
-
|
| 498 |
-
# breakpoint()
|
| 499 |
-
# Evaluation
|
| 500 |
-
# results = {}
|
| 501 |
-
# if training_args.do_eval:
|
| 502 |
-
# logger.info("*** Evaluate ***")
|
| 503 |
-
# metrics = trainer.evaluate()
|
| 504 |
-
# max_eval_samples = (
|
| 505 |
-
# data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
|
| 506 |
-
# vectorized_datasets["eval"])
|
| 507 |
-
# )
|
| 508 |
-
# metrics["eval_samples"] = min(
|
| 509 |
-
# max_eval_samples, len(vectorized_datasets["eval"]))
|
| 510 |
-
|
| 511 |
-
# trainer.log_metrics("eval", metrics)
|
| 512 |
-
# trainer.save_metrics("eval", metrics)
|
| 513 |
-
|
| 514 |
# Write model card and (optionally) push to hub
|
| 515 |
config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
|
| 516 |
kwargs = {
|
|
@@ -524,7 +520,6 @@ def main():
|
|
| 524 |
|
| 525 |
return results
|
| 526 |
|
| 527 |
-
|
| 528 |
# XLA hook
|
| 529 |
def _mp_fn(index):
|
| 530 |
# For xla_spawn (TPUs)
|
|
|
|
| 234 |
default=None,
|
| 235 |
metadata={
|
| 236 |
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
|
| 237 |
+
"value if set."
|
| 238 |
},
|
| 239 |
)
|
| 240 |
chars_to_ignore: Optional[List[str]] = list_field(
|
|
|
|
| 410 |
|
| 411 |
|
| 412 |
# Prepare data
|
| 413 |
+
# TODO The casting of the audio is not working on the NPSC in 48K. It seems to be working for Common Voice
|
| 414 |
+
# The issue is that the dataset features returns None. But to me they seem to have been set correctly
|
| 415 |
+
# In our case this is not needed, since the datasets already is available as 16K. But it would be great to solve this bug
|
| 416 |
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 417 |
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 418 |
|
| 419 |
+
|
| 420 |
+
# TODO I would really like to remove the non needed columns here. At least this cleans up the output.
|
| 421 |
+
# I am unable to figure out how to do this in streaming mode. Can not find a way to list columns.
|
| 422 |
# train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
|
| 423 |
|
| 424 |
train_dataset = train_dataset.map(prepare_dataset)
|
| 425 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
| 426 |
|
| 427 |
# Metrics
|
| 428 |
+
metric = evaluate.load("wer")  # NOTE(review): evaluate.load()'s second positional arg is config_name, not another metric; use evaluate.combine(["wer", "cer"]) to load both
|
| 429 |
|
| 430 |
# Detecting last checkpoint.
|
| 431 |
last_checkpoint = None
|
|
|
|
| 445 |
|
| 446 |
# Training
|
| 447 |
if training_args.do_train:
|
| 448 |
+
# TODO I have not yet verified that this part works as expected. The checkpoint=None should also give a meaningful error.
|
| 449 |
+
# The script should not allow you to train a whisper from scratch...
|
| 450 |
+
|
| 451 |
# use last checkpoint if exist
|
| 452 |
if last_checkpoint is not None:
|
| 453 |
print("*** Found a checkpoint!")
|
|
|
|
| 470 |
set_seed(training_args.seed)
|
| 471 |
|
| 472 |
# TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
|
| 473 |
+
# This is currently the output from Trainer - The "Num Epochs" indicates the universe might end before training is finished
|
| 474 |
+
# ***** Running training *****
|
| 475 |
+
# Num examples = 480000
|
| 476 |
+
# Num Epochs = 9223372036854775807
|
| 477 |
+
# Instantaneous batch size per device = 48
|
| 478 |
|
| 479 |
+
# Saving the processor since we need it later
|
| 480 |
processor.save_pretrained(training_args.output_dir)
|
| 481 |
|
| 482 |
+
|
| 483 |
+
# TODO - I can not get the max_eval_steps to run directly. I am therefore including it here. Not very elegant, but it works.
|
| 484 |
trainer = Seq2SeqTrainer(
|
| 485 |
args=training_args,
|
| 486 |
model=model,
|
| 487 |
train_dataset=train_dataset.with_format("torch"),
|
| 488 |
+
eval_dataset=eval_dataset.with_format("torch").take(data_args.max_eval_samples),
|
| 489 |
data_collator=data_collator,
|
| 490 |
compute_metrics=compute_metrics,
|
| 491 |
tokenizer=processor.feature_extractor,
|
| 492 |
)
|
|
|
|
| 493 |
|
| 494 |
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
| 495 |
trainer.save_model()
|
|
|
|
| 497 |
metrics = train_result.metrics
|
| 498 |
trainer.log_metrics("train", metrics)
|
| 499 |
trainer.save_metrics("train", metrics)
|
| 500 |
+
|
| 501 |
+
# TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
|
| 502 |
trainer.save_state()
|
| 503 |
|
| 504 |
if training_args.push_to_hub:
|
|
|
|
| 506 |
else:
|
| 507 |
trainer.create_model_card(**kwargs)
|
| 508 |
|
| 509 |
+
# TODO - Look closer into the model card writing.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
# Write model card and (optionally) push to hub
|
| 511 |
config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
|
| 512 |
kwargs = {
|
|
|
|
| 520 |
|
| 521 |
return results
|
| 522 |
|
|
|
|
| 523 |
# XLA hook
|
| 524 |
def _mp_fn(index):
|
| 525 |
# For xla_spawn (TPUs)
|
run_xla.sh
CHANGED
|
@@ -1,4 +1,47 @@
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python xla_spawn.py --num_cores=4 run_whisper_finetuning.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the NST dataset
|
| 2 |
+
# This is a test script for XLA on TPU
|
| 3 |
|
| 4 |
python xla_spawn.py --num_cores=4 run_whisper_finetuning.py \
|
| 5 |
+
--model_name_or_path="openai/whisper-small" \
|
| 6 |
+
--output_dir="../whisper-NST-TPU" \
|
| 7 |
+
--overwrite_output_dir=True \
|
| 8 |
+
--language="Norwegian" \
|
| 9 |
+
--task="transcribe" \
|
| 10 |
+
--dataset_name="NbAiLab/NST" \
|
| 11 |
+
--dataset_config="no-close" \
|
| 12 |
+
--do_train=True \
|
| 13 |
+
--do_eval=True \
|
| 14 |
+
--audio_column_name="audio" \
|
| 15 |
+
--text_column_name="text" \
|
| 16 |
+
--per_device_train_batch_size=16 \
|
| 17 |
+
--per_device_eval_batch_size=16 \
|
| 18 |
+
--learning_rate=2e-5 \
|
| 19 |
+
--warmup_steps=500 \
|
| 20 |
+
--max_steps=5000 \
|
| 21 |
+
--gradient_checkpointing=True \
|
| 22 |
+
--gradient_accumulation_steps=1 \
|
| 23 |
+
--group_by_length=False \
|
| 24 |
+
--evaluation_strategy="steps" \
|
| 25 |
+
--save_steps=1000 \
|
| 26 |
+
--eval_steps=1000 \
|
| 27 |
+
--max_eval_samples=100 \
|
| 28 |
+
--logging_steps=250 \
|
| 29 |
+
--fp16=True \
|
| 30 |
+
--load_best_model_at_end=True \
|
| 31 |
+
--metric_for_best_model="wer" \
|
| 32 |
+
--greater_is_better=False \
|
| 33 |
+
--report_to="tensorboard" \
|
| 34 |
+
--predict_with_generate=True \
|
| 35 |
+
--generation_max_length=225 \
|
| 36 |
+
--print_training_arguments=True \
|
| 37 |
+
--push_to_hub=True
|
| 38 |
|
| 39 |
|
| 40 |
+
# Very likely that some of this parameters needs to be added
|
| 41 |
+
# tpu_name (:obj:`str`, `optional`):
|
| 42 |
+
# The name of the TPU the process is running on.
|
| 43 |
+
# tpu_zone (:obj:`str`, `optional`):
|
| 44 |
+
# The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect
|
| 45 |
+
# from metadata.
|
| 46 |
+
# xla (:obj:`bool`, `optional`):
|
| 47 |
+
# Whether to activate the XLA compilation or not.
|