lots of minor updates
Browse files- run.sh → run_cc.sh +12 -9
- run_npsc.sh +9 -7
- run_nst.sh +9 -6
- run_whisper_finetuning.py +23 -28
- run_xla.sh +43 -0
run.sh → run_cc.sh
RENAMED
|
@@ -1,7 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python run_whisper_finetuning.py \
|
| 3 |
--model_name_or_path="openai/whisper-small" \
|
| 4 |
-
--output_dir="../whisper-
|
| 5 |
--overwrite_output_dir=True \
|
| 6 |
--language="Norwegian" \
|
| 7 |
--task="transcribe" \
|
|
@@ -11,18 +15,18 @@ python run_whisper_finetuning.py \
|
|
| 11 |
--do_eval=True \
|
| 12 |
--audio_column_name="audio" \
|
| 13 |
--text_column_name="sentence" \
|
| 14 |
-
--per_device_train_batch_size=
|
| 15 |
-
--per_device_train_batch_size=
|
| 16 |
-
--learning_rate=
|
| 17 |
--warmup_steps=500 \
|
| 18 |
-
--max_steps=
|
| 19 |
--gradient_checkpointing=True \
|
| 20 |
--gradient_accumulation_steps=1 \
|
| 21 |
--group_by_length=False \
|
| 22 |
--evaluation_strategy="steps" \
|
| 23 |
-
--save_steps=
|
| 24 |
-
--eval_steps=
|
| 25 |
-
--max_eval_samples=
|
| 26 |
--logging_steps=250 \
|
| 27 |
--fp16=True \
|
| 28 |
--load_best_model_at_end=True \
|
|
@@ -34,5 +38,4 @@ python run_whisper_finetuning.py \
|
|
| 34 |
--print_training_arguments=True \
|
| 35 |
--push_to_hub=True
|
| 36 |
|
| 37 |
-
|
| 38 |
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the very small Nynorsk Common Voice
|
| 2 |
+
# Test script. Currently this only runs if you uncomment the lines in the training code for casting this from 48K to 16K
|
| 3 |
+
# Currently for training on a 48GB GPU
|
| 4 |
+
# Reduce batch size and learning rate if training on smaller GPU
|
| 5 |
|
| 6 |
python run_whisper_finetuning.py \
|
| 7 |
--model_name_or_path="openai/whisper-small" \
|
| 8 |
+
--output_dir="../whisper-CC-nn" \
|
| 9 |
--overwrite_output_dir=True \
|
| 10 |
--language="Norwegian" \
|
| 11 |
--task="transcribe" \
|
|
|
|
| 15 |
--do_eval=True \
|
| 16 |
--audio_column_name="audio" \
|
| 17 |
--text_column_name="sentence" \
|
| 18 |
+
--per_device_train_batch_size=48 \
|
| 19 |
+
--per_device_eval_batch_size=48 \
|
| 20 |
+
--learning_rate=4e-5 \
|
| 21 |
--warmup_steps=500 \
|
| 22 |
+
--max_steps=1000 \
|
| 23 |
--gradient_checkpointing=True \
|
| 24 |
--gradient_accumulation_steps=1 \
|
| 25 |
--group_by_length=False \
|
| 26 |
--evaluation_strategy="steps" \
|
| 27 |
+
--save_steps=250 \
|
| 28 |
+
--eval_steps=250 \
|
| 29 |
+
--max_eval_samples=50 \
|
| 30 |
--logging_steps=250 \
|
| 31 |
--fp16=True \
|
| 32 |
--load_best_model_at_end=True \
|
|
|
|
| 38 |
--print_training_arguments=True \
|
| 39 |
--push_to_hub=True
|
| 40 |
|
|
|
|
| 41 |
|
run_npsc.sh
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python run_whisper_finetuning.py \
|
| 3 |
--model_name_or_path="openai/whisper-small" \
|
| 4 |
-
--output_dir="../whisper-
|
| 5 |
--overwrite_output_dir=True \
|
| 6 |
--language="Norwegian" \
|
| 7 |
--task="transcribe" \
|
|
@@ -10,18 +13,19 @@ python run_whisper_finetuning.py \
|
|
| 10 |
--do_train=True \
|
| 11 |
--do_eval=True \
|
| 12 |
--audio_column_name="audio" \
|
| 13 |
-
--text_column_name="
|
| 14 |
--per_device_train_batch_size=16 \
|
| 15 |
-
|
| 16 |
-
--learning_rate=
|
| 17 |
--warmup_steps=500 \
|
| 18 |
-
--max_steps=
|
| 19 |
--gradient_checkpointing=True \
|
| 20 |
--gradient_accumulation_steps=1 \
|
| 21 |
--group_by_length=False \
|
| 22 |
--evaluation_strategy="steps" \
|
| 23 |
--save_steps=1000 \
|
| 24 |
--eval_steps=1000 \
|
|
|
|
| 25 |
--logging_steps=250 \
|
| 26 |
--fp16=True \
|
| 27 |
--load_best_model_at_end=True \
|
|
@@ -33,5 +37,3 @@ python run_whisper_finetuning.py \
|
|
| 33 |
--print_training_arguments=True \
|
| 34 |
--push_to_hub=True
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the NPSC_orto dataset
|
| 2 |
+
# Currently for training on a 48GB GPU
|
| 3 |
+
# Reduce batch size and learning rate if training on smaller GPU
|
| 4 |
|
| 5 |
python run_whisper_finetuning.py \
|
| 6 |
--model_name_or_path="openai/whisper-small" \
|
| 7 |
+
--output_dir="../whisper-NPSC" \
|
| 8 |
--overwrite_output_dir=True \
|
| 9 |
--language="Norwegian" \
|
| 10 |
--task="transcribe" \
|
|
|
|
| 13 |
--do_train=True \
|
| 14 |
--do_eval=True \
|
| 15 |
--audio_column_name="audio" \
|
| 16 |
+
--text_column_name="sentence_nob" \
|
| 17 |
--per_device_train_batch_size=16 \
|
| 18 |
+
--per_device_eval_batch_size=16 \
|
| 19 |
+
--learning_rate=4e-5 \
|
| 20 |
--warmup_steps=500 \
|
| 21 |
+
--max_steps=5000 \
|
| 22 |
--gradient_checkpointing=True \
|
| 23 |
--gradient_accumulation_steps=1 \
|
| 24 |
--group_by_length=False \
|
| 25 |
--evaluation_strategy="steps" \
|
| 26 |
--save_steps=1000 \
|
| 27 |
--eval_steps=1000 \
|
| 28 |
+
--max_eval_samples=100 \
|
| 29 |
--logging_steps=250 \
|
| 30 |
--fp16=True \
|
| 31 |
--load_best_model_at_end=True \
|
|
|
|
| 37 |
--print_training_arguments=True \
|
| 38 |
--push_to_hub=True
|
| 39 |
|
|
|
|
|
|
run_nst.sh
CHANGED
|
@@ -1,7 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python run_whisper_finetuning.py \
|
| 3 |
--model_name_or_path="openai/whisper-small" \
|
| 4 |
-
--output_dir="../whisper-
|
| 5 |
--overwrite_output_dir=True \
|
| 6 |
--language="Norwegian" \
|
| 7 |
--task="transcribe" \
|
|
@@ -11,17 +14,17 @@ python run_whisper_finetuning.py \
|
|
| 11 |
--do_eval=True \
|
| 12 |
--audio_column_name="audio" \
|
| 13 |
--text_column_name="text" \
|
| 14 |
-
--per_device_train_batch_size=
|
| 15 |
-
|
| 16 |
-
--learning_rate=
|
| 17 |
--warmup_steps=500 \
|
| 18 |
-
--max_steps=
|
| 19 |
--gradient_checkpointing=True \
|
| 20 |
--gradient_accumulation_steps=1 \
|
| 21 |
--group_by_length=False \
|
| 22 |
--evaluation_strategy="steps" \
|
| 23 |
--save_steps=1000 \
|
| 24 |
-
--eval_steps=
|
| 25 |
--max_eval_samples=100 \
|
| 26 |
--logging_steps=250 \
|
| 27 |
--fp16=True \
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the NST dataset
|
| 2 |
+
# Currently for training on a 48GB GPU
|
| 3 |
+
# Reduce batch size and learning rate if training on smaller GPU
|
| 4 |
|
| 5 |
python run_whisper_finetuning.py \
|
| 6 |
--model_name_or_path="openai/whisper-small" \
|
| 7 |
+
--output_dir="../whisper-NST" \
|
| 8 |
--overwrite_output_dir=True \
|
| 9 |
--language="Norwegian" \
|
| 10 |
--task="transcribe" \
|
|
|
|
| 14 |
--do_eval=True \
|
| 15 |
--audio_column_name="audio" \
|
| 16 |
--text_column_name="text" \
|
| 17 |
+
--per_device_train_batch_size=48 \
|
| 18 |
+
--per_device_eval_batch_size=48 \
|
| 19 |
+
--learning_rate=4e-5 \
|
| 20 |
--warmup_steps=500 \
|
| 21 |
+
--max_steps=5000 \
|
| 22 |
--gradient_checkpointing=True \
|
| 23 |
--gradient_accumulation_steps=1 \
|
| 24 |
--group_by_length=False \
|
| 25 |
--evaluation_strategy="steps" \
|
| 26 |
--save_steps=1000 \
|
| 27 |
+
--eval_steps=1000 \
|
| 28 |
--max_eval_samples=100 \
|
| 29 |
--logging_steps=250 \
|
| 30 |
--fp16=True \
|
run_whisper_finetuning.py
CHANGED
|
@@ -234,7 +234,7 @@ class DataTrainingArguments:
|
|
| 234 |
default=None,
|
| 235 |
metadata={
|
| 236 |
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
|
| 237 |
-
"value if set.
|
| 238 |
},
|
| 239 |
)
|
| 240 |
chars_to_ignore: Optional[List[str]] = list_field(
|
|
@@ -410,18 +410,22 @@ def main():
|
|
| 410 |
|
| 411 |
|
| 412 |
# Prepare data
|
| 413 |
-
#
|
|
|
|
|
|
|
| 414 |
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 415 |
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 416 |
|
| 417 |
-
|
|
|
|
|
|
|
| 418 |
# train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
|
| 419 |
|
| 420 |
train_dataset = train_dataset.map(prepare_dataset)
|
| 421 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
| 422 |
|
| 423 |
# Metrics
|
| 424 |
-
metric = evaluate.load("wer")
|
| 425 |
|
| 426 |
# Detecting last checkpoint.
|
| 427 |
last_checkpoint = None
|
|
@@ -441,7 +445,9 @@ def main():
|
|
| 441 |
|
| 442 |
# Training
|
| 443 |
if training_args.do_train:
|
| 444 |
-
|
|
|
|
|
|
|
| 445 |
# use last checkpoint if exist
|
| 446 |
if last_checkpoint is not None:
|
| 447 |
print("*** Found a checkpoint!")
|
|
@@ -464,21 +470,26 @@ def main():
|
|
| 464 |
set_seed(training_args.seed)
|
| 465 |
|
| 466 |
# TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
|
| 467 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
-
#
|
| 470 |
processor.save_pretrained(training_args.output_dir)
|
| 471 |
|
|
|
|
|
|
|
| 472 |
trainer = Seq2SeqTrainer(
|
| 473 |
args=training_args,
|
| 474 |
model=model,
|
| 475 |
train_dataset=train_dataset.with_format("torch"),
|
| 476 |
-
eval_dataset=eval_dataset.with_format("torch"),
|
| 477 |
data_collator=data_collator,
|
| 478 |
compute_metrics=compute_metrics,
|
| 479 |
tokenizer=processor.feature_extractor,
|
| 480 |
)
|
| 481 |
-
|
| 482 |
|
| 483 |
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
| 484 |
trainer.save_model()
|
|
@@ -486,6 +497,8 @@ def main():
|
|
| 486 |
metrics = train_result.metrics
|
| 487 |
trainer.log_metrics("train", metrics)
|
| 488 |
trainer.save_metrics("train", metrics)
|
|
|
|
|
|
|
| 489 |
trainer.save_state()
|
| 490 |
|
| 491 |
if training_args.push_to_hub:
|
|
@@ -493,24 +506,7 @@ def main():
|
|
| 493 |
else:
|
| 494 |
trainer.create_model_card(**kwargs)
|
| 495 |
|
| 496 |
-
# TODO - Look closer into the
|
| 497 |
-
|
| 498 |
-
# breakpoint()
|
| 499 |
-
# Evaluation
|
| 500 |
-
# results = {}
|
| 501 |
-
# if training_args.do_eval:
|
| 502 |
-
# logger.info("*** Evaluate ***")
|
| 503 |
-
# metrics = trainer.evaluate()
|
| 504 |
-
# max_eval_samples = (
|
| 505 |
-
# data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
|
| 506 |
-
# vectorized_datasets["eval"])
|
| 507 |
-
# )
|
| 508 |
-
# metrics["eval_samples"] = min(
|
| 509 |
-
# max_eval_samples, len(vectorized_datasets["eval"]))
|
| 510 |
-
|
| 511 |
-
# trainer.log_metrics("eval", metrics)
|
| 512 |
-
# trainer.save_metrics("eval", metrics)
|
| 513 |
-
|
| 514 |
# Write model card and (optionally) push to hub
|
| 515 |
config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
|
| 516 |
kwargs = {
|
|
@@ -524,7 +520,6 @@ def main():
|
|
| 524 |
|
| 525 |
return results
|
| 526 |
|
| 527 |
-
|
| 528 |
# XLA hook
|
| 529 |
def _mp_fn(index):
|
| 530 |
# For xla_spawn (TPUs)
|
|
|
|
| 234 |
default=None,
|
| 235 |
metadata={
|
| 236 |
"help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
|
| 237 |
+
"value if set."
|
| 238 |
},
|
| 239 |
)
|
| 240 |
chars_to_ignore: Optional[List[str]] = list_field(
|
|
|
|
| 410 |
|
| 411 |
|
| 412 |
# Prepare data
|
| 413 |
+
# TODO The casting of the audio is not working on the NPSC in 48K. It seems to be working for Common Voice
|
| 414 |
+
# The issue is that the dataset features returns None. But to me they seem to have been set correctly
|
| 415 |
+
# In our case this is not needed, since the datasets already is available as 16K. But it would be great to solve this bug
|
| 416 |
# train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 417 |
# eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
|
| 418 |
|
| 419 |
+
|
| 420 |
+
# TODO I would really like to remove the non needed columns here. At least this cleans up the output.
|
| 421 |
+
# I am unable to figure out how to do this in streaming mode. Can not find a way to list columns.
|
| 422 |
# train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
|
| 423 |
|
| 424 |
train_dataset = train_dataset.map(prepare_dataset)
|
| 425 |
eval_dataset = eval_dataset.map(prepare_dataset)
|
| 426 |
|
| 427 |
# Metrics
|
| 428 |
+
metric = evaluate.load("wer")  # NOTE(review): evaluate.load()'s second positional arg is config_name, not another metric; use evaluate.combine(["wer", "cer"]) to load both
|
| 429 |
|
| 430 |
# Detecting last checkpoint.
|
| 431 |
last_checkpoint = None
|
|
|
|
| 445 |
|
| 446 |
# Training
|
| 447 |
if training_args.do_train:
|
| 448 |
+
# TODO I have not yet verified that this part works as expected. The checkpoint=None should also give a meaningful error.
|
| 449 |
+
# The script should not allow you to train a whisper from scratch...
|
| 450 |
+
|
| 451 |
# use last checkpoint if exist
|
| 452 |
if last_checkpoint is not None:
|
| 453 |
print("*** Found a checkpoint!")
|
|
|
|
| 470 |
set_seed(training_args.seed)
|
| 471 |
|
| 472 |
# TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
|
| 473 |
+
# This is currently the output from Trainer - The "Num Epochs" indicates the universe might end before training is finished
|
| 474 |
+
# ***** Running training *****
|
| 475 |
+
# Num examples = 480000
|
| 476 |
+
# Num Epochs = 9223372036854775807
|
| 477 |
+
# Instantaneous batch size per device = 48
|
| 478 |
|
| 479 |
+
# Saving the processor since we need it later
|
| 480 |
processor.save_pretrained(training_args.output_dir)
|
| 481 |
|
| 482 |
+
|
| 483 |
+
# TODO - I can not get the max_eval_steps to run directly. I am therefore including it here. Not very elegant, but it works.
|
| 484 |
trainer = Seq2SeqTrainer(
|
| 485 |
args=training_args,
|
| 486 |
model=model,
|
| 487 |
train_dataset=train_dataset.with_format("torch"),
|
| 488 |
+
eval_dataset=eval_dataset.with_format("torch").take(data_args.max_eval_samples),
|
| 489 |
data_collator=data_collator,
|
| 490 |
compute_metrics=compute_metrics,
|
| 491 |
tokenizer=processor.feature_extractor,
|
| 492 |
)
|
|
|
|
| 493 |
|
| 494 |
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
| 495 |
trainer.save_model()
|
|
|
|
| 497 |
metrics = train_result.metrics
|
| 498 |
trainer.log_metrics("train", metrics)
|
| 499 |
trainer.save_metrics("train", metrics)
|
| 500 |
+
|
| 501 |
+
# TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
|
| 502 |
trainer.save_state()
|
| 503 |
|
| 504 |
if training_args.push_to_hub:
|
|
|
|
| 506 |
else:
|
| 507 |
trainer.create_model_card(**kwargs)
|
| 508 |
|
| 509 |
+
# TODO - Look closer into the model card writing.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 510 |
# Write model card and (optionally) push to hub
|
| 511 |
config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
|
| 512 |
kwargs = {
|
|
|
|
| 520 |
|
| 521 |
return results
|
| 522 |
|
|
|
|
| 523 |
# XLA hook
|
| 524 |
def _mp_fn(index):
|
| 525 |
# For xla_spawn (TPUs)
|
run_xla.sh
CHANGED
|
@@ -1,4 +1,47 @@
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
python xla_spawn.py --num_cores=4 run_whisper_finetuning.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Whisper Finetuning script for the NST dataset
|
| 2 |
+
# This is a test script for XLA on TPU
|
| 3 |
|
| 4 |
python xla_spawn.py --num_cores=4 run_whisper_finetuning.py \
|
| 5 |
+
--model_name_or_path="openai/whisper-small" \
|
| 6 |
+
--output_dir="../whisper-NST-TPU" \
|
| 7 |
+
--overwrite_output_dir=True \
|
| 8 |
+
--language="Norwegian" \
|
| 9 |
+
--task="transcribe" \
|
| 10 |
+
--dataset_name="NbAiLab/NST" \
|
| 11 |
+
--dataset_config="no-close" \
|
| 12 |
+
--do_train=True \
|
| 13 |
+
--do_eval=True \
|
| 14 |
+
--audio_column_name="audio" \
|
| 15 |
+
--text_column_name="text" \
|
| 16 |
+
--per_device_train_batch_size=16 \
|
| 17 |
+
--per_device_eval_batch_size=16 \
|
| 18 |
+
--learning_rate=2e-5 \
|
| 19 |
+
--warmup_steps=500 \
|
| 20 |
+
--max_steps=5000 \
|
| 21 |
+
--gradient_checkpointing=True \
|
| 22 |
+
--gradient_accumulation_steps=1 \
|
| 23 |
+
--group_by_length=False \
|
| 24 |
+
--evaluation_strategy="steps" \
|
| 25 |
+
--save_steps=1000 \
|
| 26 |
+
--eval_steps=1000 \
|
| 27 |
+
--max_eval_samples=100 \
|
| 28 |
+
--logging_steps=250 \
|
| 29 |
+
--fp16=True \
|
| 30 |
+
--load_best_model_at_end=True \
|
| 31 |
+
--metric_for_best_model="wer" \
|
| 32 |
+
--greater_is_better=False \
|
| 33 |
+
--report_to="tensorboard" \
|
| 34 |
+
--predict_with_generate=True \
|
| 35 |
+
--generation_max_length=225 \
|
| 36 |
+
--print_training_arguments=True \
|
| 37 |
+
--push_to_hub=True
|
| 38 |
|
| 39 |
|
| 40 |
+
# Very likely that some of this parameters needs to be added
|
| 41 |
+
# tpu_name (:obj:`str`, `optional`):
|
| 42 |
+
# The name of the TPU the process is running on.
|
| 43 |
+
# tpu_zone (:obj:`str`, `optional`):
|
| 44 |
+
# The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect
|
| 45 |
+
# from metadata.
|
| 46 |
+
# xla (:obj:`bool`, `optional`):
|
| 47 |
+
# Whether to activate the XLA compilation or not.
|