pere committed on
Commit
50a96f5
·
1 Parent(s): 1cf8cf6

lots of minor updates

Browse files
Files changed (5) hide show
  1. run.sh → run_cc.sh +12 -9
  2. run_npsc.sh +9 -7
  3. run_nst.sh +9 -6
  4. run_whisper_finetuning.py +23 -28
  5. run_xla.sh +43 -0
run.sh → run_cc.sh RENAMED
@@ -1,7 +1,11 @@
 
 
 
 
1
 
2
  python run_whisper_finetuning.py \
3
  --model_name_or_path="openai/whisper-small" \
4
- --output_dir="../whisper-testrun1" \
5
  --overwrite_output_dir=True \
6
  --language="Norwegian" \
7
  --task="transcribe" \
@@ -11,18 +15,18 @@ python run_whisper_finetuning.py \
11
  --do_eval=True \
12
  --audio_column_name="audio" \
13
  --text_column_name="sentence" \
14
- --per_device_train_batch_size=32 \
15
- --per_device_train_batch_size=32 \
16
- --learning_rate=2e-5 \
17
  --warmup_steps=500 \
18
- --max_steps=10000 \
19
  --gradient_checkpointing=True \
20
  --gradient_accumulation_steps=1 \
21
  --group_by_length=False \
22
  --evaluation_strategy="steps" \
23
- --save_steps=1000 \
24
- --eval_steps=1000 \
25
- --max_eval_samples=10 \
26
  --logging_steps=250 \
27
  --fp16=True \
28
  --load_best_model_at_end=True \
@@ -34,5 +38,4 @@ python run_whisper_finetuning.py \
34
  --print_training_arguments=True \
35
  --push_to_hub=True
36
 
37
-
38
 
 
1
+ # Whisper Finetuning script for the very small Nynorsk Common Voice
2
+ # Test script. Currently this only runs if you uncomment the lines in the training code for casting this from 48K to 16K
3
+ # Currently for training on a 48GB GPU
4
+ # Reduce batch size and learning rate if training on smaller GPU
5
 
6
  python run_whisper_finetuning.py \
7
  --model_name_or_path="openai/whisper-small" \
8
+ --output_dir="../whisper-CC-nn" \
9
  --overwrite_output_dir=True \
10
  --language="Norwegian" \
11
  --task="transcribe" \
 
15
  --do_eval=True \
16
  --audio_column_name="audio" \
17
  --text_column_name="sentence" \
18
+ --per_device_train_batch_size=48 \
19
+ --per_device_eval_batch_size=48 \
20
+ --learning_rate=4e-5 \
21
  --warmup_steps=500 \
22
+ --max_steps=1000 \
23
  --gradient_checkpointing=True \
24
  --gradient_accumulation_steps=1 \
25
  --group_by_length=False \
26
  --evaluation_strategy="steps" \
27
+ --save_steps=250 \
28
+ --eval_steps=250 \
29
+ --max_eval_samples=50 \
30
  --logging_steps=250 \
31
  --fp16=True \
32
  --load_best_model_at_end=True \
 
38
  --print_training_arguments=True \
39
  --push_to_hub=True
40
 
 
41
 
run_npsc.sh CHANGED
@@ -1,7 +1,10 @@
 
 
 
1
 
2
  python run_whisper_finetuning.py \
3
  --model_name_or_path="openai/whisper-small" \
4
- --output_dir="../whisper-testrun1" \
5
  --overwrite_output_dir=True \
6
  --language="Norwegian" \
7
  --task="transcribe" \
@@ -10,18 +13,19 @@ python run_whisper_finetuning.py \
10
  --do_train=True \
11
  --do_eval=True \
12
  --audio_column_name="audio" \
13
- --text_column_name="text" \
14
  --per_device_train_batch_size=16 \
15
- --per_device_train_batch_size=16 \
16
- --learning_rate=2e-5 \
17
  --warmup_steps=500 \
18
- --max_steps=10000 \
19
  --gradient_checkpointing=True \
20
  --gradient_accumulation_steps=1 \
21
  --group_by_length=False \
22
  --evaluation_strategy="steps" \
23
  --save_steps=1000 \
24
  --eval_steps=1000 \
 
25
  --logging_steps=250 \
26
  --fp16=True \
27
  --load_best_model_at_end=True \
@@ -33,5 +37,3 @@ python run_whisper_finetuning.py \
33
  --print_training_arguments=True \
34
  --push_to_hub=True
35
 
36
-
37
-
 
1
+ # Whisper Finetuning script for the NPSC_orto dataset
2
+ # Currently for training on a 48GB GPU
3
+ # Reduce batch size and learning rate if training on smaller GPU
4
 
5
  python run_whisper_finetuning.py \
6
  --model_name_or_path="openai/whisper-small" \
7
+ --output_dir="../whisper-NPSC" \
8
  --overwrite_output_dir=True \
9
  --language="Norwegian" \
10
  --task="transcribe" \
 
13
  --do_train=True \
14
  --do_eval=True \
15
  --audio_column_name="audio" \
16
+ --text_column_name="sentence_nob" \
17
  --per_device_train_batch_size=16 \
18
+ --per_device_eval_batch_size=16 \
19
+ --learning_rate=4e-5 \
20
  --warmup_steps=500 \
21
+ --max_steps=5000 \
22
  --gradient_checkpointing=True \
23
  --gradient_accumulation_steps=1 \
24
  --group_by_length=False \
25
  --evaluation_strategy="steps" \
26
  --save_steps=1000 \
27
  --eval_steps=1000 \
28
+ --max_eval_samples=100 \
29
  --logging_steps=250 \
30
  --fp16=True \
31
  --load_best_model_at_end=True \
 
37
  --print_training_arguments=True \
38
  --push_to_hub=True
39
 
 
 
run_nst.sh CHANGED
@@ -1,7 +1,10 @@
 
 
 
1
 
2
  python run_whisper_finetuning.py \
3
  --model_name_or_path="openai/whisper-small" \
4
- --output_dir="../whisper-testrun1" \
5
  --overwrite_output_dir=True \
6
  --language="Norwegian" \
7
  --task="transcribe" \
@@ -11,17 +14,17 @@ python run_whisper_finetuning.py \
11
  --do_eval=True \
12
  --audio_column_name="audio" \
13
  --text_column_name="text" \
14
- --per_device_train_batch_size=16 \
15
- --per_device_train_batch_size=16 \
16
- --learning_rate=2e-5 \
17
  --warmup_steps=500 \
18
- --max_steps=10000 \
19
  --gradient_checkpointing=True \
20
  --gradient_accumulation_steps=1 \
21
  --group_by_length=False \
22
  --evaluation_strategy="steps" \
23
  --save_steps=1000 \
24
- --eval_steps=10 \
25
  --max_eval_samples=100 \
26
  --logging_steps=250 \
27
  --fp16=True \
 
1
+ # Whisper Finetuning script for the NST dataset
2
+ # Currently for training on a 48GB GPU
3
+ # Reduce batch size and learning rate if training on smaller GPU
4
 
5
  python run_whisper_finetuning.py \
6
  --model_name_or_path="openai/whisper-small" \
7
+ --output_dir="../whisper-NST" \
8
  --overwrite_output_dir=True \
9
  --language="Norwegian" \
10
  --task="transcribe" \
 
14
  --do_eval=True \
15
  --audio_column_name="audio" \
16
  --text_column_name="text" \
17
+ --per_device_train_batch_size=48 \
18
+ --per_device_eval_batch_size=48 \
19
+ --learning_rate=4e-5 \
20
  --warmup_steps=500 \
21
+ --max_steps=5000 \
22
  --gradient_checkpointing=True \
23
  --gradient_accumulation_steps=1 \
24
  --group_by_length=False \
25
  --evaluation_strategy="steps" \
26
  --save_steps=1000 \
27
+ --eval_steps=1000 \
28
  --max_eval_samples=100 \
29
  --logging_steps=250 \
30
  --fp16=True \
run_whisper_finetuning.py CHANGED
@@ -234,7 +234,7 @@ class DataTrainingArguments:
234
  default=None,
235
  metadata={
236
  "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
237
- "value if set. Should also be set when streaming."
238
  },
239
  )
240
  chars_to_ignore: Optional[List[str]] = list_field(
@@ -410,18 +410,22 @@ def main():
410
 
411
 
412
  # Prepare data
413
- # Is not working.... but since it is already 16000 maybe I dont need it?
 
 
414
  # train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
415
  # eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
416
 
417
- # TODO Not able to implement in Streaming mode. Can not find a way to list columns. But is is necessary?
 
 
418
  # train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
419
 
420
  train_dataset = train_dataset.map(prepare_dataset)
421
  eval_dataset = eval_dataset.map(prepare_dataset)
422
 
423
  # Metrics
424
- metric = evaluate.load("wer")
425
 
426
  # Detecting last checkpoint.
427
  last_checkpoint = None
@@ -441,7 +445,9 @@ def main():
441
 
442
  # Training
443
  if training_args.do_train:
444
-
 
 
445
  # use last checkpoint if exist
446
  if last_checkpoint is not None:
447
  print("*** Found a checkpoint!")
@@ -464,21 +470,26 @@ def main():
464
  set_seed(training_args.seed)
465
 
466
  # TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
467
- # Code here
 
 
 
 
468
 
469
- # Save the processor as well, since we need it later
470
  processor.save_pretrained(training_args.output_dir)
471
 
 
 
472
  trainer = Seq2SeqTrainer(
473
  args=training_args,
474
  model=model,
475
  train_dataset=train_dataset.with_format("torch"),
476
- eval_dataset=eval_dataset.with_format("torch"),
477
  data_collator=data_collator,
478
  compute_metrics=compute_metrics,
479
  tokenizer=processor.feature_extractor,
480
  )
481
-
482
 
483
  train_result = trainer.train(resume_from_checkpoint=checkpoint)
484
  trainer.save_model()
@@ -486,6 +497,8 @@ def main():
486
  metrics = train_result.metrics
487
  trainer.log_metrics("train", metrics)
488
  trainer.save_metrics("train", metrics)
 
 
489
  trainer.save_state()
490
 
491
  if training_args.push_to_hub:
@@ -493,24 +506,7 @@ def main():
493
  else:
494
  trainer.create_model_card(**kwargs)
495
 
496
- # TODO - Look closer into the evaluation and the model card writing.
497
-
498
- # breakpoint()
499
- # Evaluation
500
- # results = {}
501
- # if training_args.do_eval:
502
- # logger.info("*** Evaluate ***")
503
- # metrics = trainer.evaluate()
504
- # max_eval_samples = (
505
- # data_args.max_eval_samples if data_args.max_eval_samples is not None else len(
506
- # vectorized_datasets["eval"])
507
- # )
508
- # metrics["eval_samples"] = min(
509
- # max_eval_samples, len(vectorized_datasets["eval"]))
510
-
511
- # trainer.log_metrics("eval", metrics)
512
- # trainer.save_metrics("eval", metrics)
513
-
514
  # Write model card and (optionally) push to hub
515
  config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
516
  kwargs = {
@@ -524,7 +520,6 @@ def main():
524
 
525
  return results
526
 
527
-
528
  # XLA hook
529
  def _mp_fn(index):
530
  # For xla_spawn (TPUs)
 
234
  default=None,
235
  metadata={
236
  "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
237
+ "value if set."
238
  },
239
  )
240
  chars_to_ignore: Optional[List[str]] = list_field(
 
410
 
411
 
412
  # Prepare data
413
+ # TODO The casting of the audio is not working on the NPSC in 48K. It seems to be working for Common Voice
414
+ # The issue is that the dataset features return None. But to me they seem to have been set correctly
415
+ # In our case this is not needed, since the datasets are already available at 16K. But it would be great to solve this bug
416
  # train_dataset = train_dataset.cast_column("audio", Audio(sampling_rate=16000))
417
  # eval_dataset = eval_dataset.cast_column("audio", Audio(sampling_rate=16000))
418
 
419
+
420
+ # TODO I would really like to remove the unneeded columns here. At least this cleans up the output.
421
+ # I am unable to figure out how to do this in Streaming mode. Cannot find a way to list columns.
422
  # train_data = train_data.map(prepare_dataset, remove_columns=train_data.column_names, num_proc=1)
423
 
424
  train_dataset = train_dataset.map(prepare_dataset)
425
  eval_dataset = eval_dataset.map(prepare_dataset)
426
 
427
  # Metrics
428
+ metric = evaluate.combine(["wer", "cer"])  # NOTE(review): load("wer","cer") treats "cer" as config_name, not a second metric
429
 
430
  # Detecting last checkpoint.
431
  last_checkpoint = None
 
445
 
446
  # Training
447
  if training_args.do_train:
448
+ # TODO I have not yet verified that this part works as expected. The checkpoint=None should also give a meaningful error.
449
+ # The script should not allow you to train a whisper from scratch...
450
+
451
  # use last checkpoint if exist
452
  if last_checkpoint is not None:
453
  print("*** Found a checkpoint!")
 
470
  set_seed(training_args.seed)
471
 
472
  # TODO - I think the number of epochs needs to be set manually? Now it seems to be calculated based on the save steps. How do I do this?
473
+ # This is currently the output from Trainer - The "Num Epochs" indicates the universe might end before training is finished
474
+ # ***** Running training *****
475
+ # Num examples = 480000
476
+ # Num Epochs = 9223372036854775807
477
+ # Instantaneous batch size per device = 48
478
 
479
+ # Saving the processor since we need it later
480
  processor.save_pretrained(training_args.output_dir)
481
 
482
+
483
+ # TODO - I cannot get max_eval_samples to apply directly. I am therefore including it here. Not very elegant, but it works.
484
  trainer = Seq2SeqTrainer(
485
  args=training_args,
486
  model=model,
487
  train_dataset=train_dataset.with_format("torch"),
488
+ eval_dataset=eval_dataset.with_format("torch").take(data_args.max_eval_samples),
489
  data_collator=data_collator,
490
  compute_metrics=compute_metrics,
491
  tokenizer=processor.feature_extractor,
492
  )
 
493
 
494
  train_result = trainer.train(resume_from_checkpoint=checkpoint)
495
  trainer.save_model()
 
497
  metrics = train_result.metrics
498
  trainer.log_metrics("train", metrics)
499
  trainer.save_metrics("train", metrics)
500
+
501
+ # TODO What does this do? Does this also mean we can load the state? Can this be done per checkpoint?
502
  trainer.save_state()
503
 
504
  if training_args.push_to_hub:
 
506
  else:
507
  trainer.create_model_card(**kwargs)
508
 
509
+ # TODO - Look closer into the model card writing.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
510
  # Write model card and (optionally) push to hub
511
  config_name = data_args.dataset_config_name if data_args.dataset_config_name is not None else "na"
512
  kwargs = {
 
520
 
521
  return results
522
 
 
523
  # XLA hook
524
  def _mp_fn(index):
525
  # For xla_spawn (TPUs)
run_xla.sh CHANGED
@@ -1,4 +1,47 @@
 
 
1
 
2
  python xla_spawn.py --num_cores=4 run_whisper_finetuning.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
 
 
 
 
 
 
 
 
 
 
1
+ # Whisper Finetuning script for the NST dataset
2
+ # This is a test script for XLA on TPU
3
 
4
  python xla_spawn.py --num_cores=4 run_whisper_finetuning.py \
5
+ --model_name_or_path="openai/whisper-small" \
6
+ --output_dir="../whisper-NST-TPU" \
7
+ --overwrite_output_dir=True \
8
+ --language="Norwegian" \
9
+ --task="transcribe" \
10
+ --dataset_name="NbAiLab/NST" \
11
+ --dataset_config="no-close" \
12
+ --do_train=True \
13
+ --do_eval=True \
14
+ --audio_column_name="audio" \
15
+ --text_column_name="text" \
16
+ --per_device_train_batch_size=16 \
17
+ --per_device_eval_batch_size=16 \
18
+ --learning_rate=2e-5 \
19
+ --warmup_steps=500 \
20
+ --max_steps=5000 \
21
+ --gradient_checkpointing=True \
22
+ --gradient_accumulation_steps=1 \
23
+ --group_by_length=False \
24
+ --evaluation_strategy="steps" \
25
+ --save_steps=1000 \
26
+ --eval_steps=1000 \
27
+ --max_eval_samples=100 \
28
+ --logging_steps=250 \
29
+ --fp16=True \
30
+ --load_best_model_at_end=True \
31
+ --metric_for_best_model="wer" \
32
+ --greater_is_better=False \
33
+ --report_to="tensorboard" \
34
+ --predict_with_generate=True \
35
+ --generation_max_length=225 \
36
+ --print_training_arguments=True \
37
+ --push_to_hub=True
38
 
39
 
40
+ # Very likely that some of these parameters need to be added
41
+ # tpu_name (:obj:`str`, `optional`):
42
+ # The name of the TPU the process is running on.
43
+ # tpu_zone (:obj:`str`, `optional`):
44
+ # The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect
45
+ # from metadata.
46
+ # xla (:obj:`bool`, `optional`):
47
+ # Whether to activate the XLA compilation or not.