Kit-Lemonfoot committed on
Commit
02c190e
1 Parent(s): 7989be0

Rearranged a lot for the sake of client memory optimization.

Browse files
Files changed (1) hide show
  1. app.py +154 -128
app.py CHANGED
@@ -406,6 +406,77 @@ if __name__ == '__main__':
406
  load_hubert()
407
  categories = load_model()
408
  voices = list(language_dict.keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
409
  with gr.Blocks(theme=gr.themes.Base()) as app:
410
  gr.Markdown(
411
  "# <center> VTuber RVC Models\n"
@@ -433,81 +504,6 @@ if __name__ == '__main__':
433
  (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
434
  '</div>'
435
  )
436
- with gr.Row():
437
- with gr.Column():
438
- vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Edge-TTS")
439
- # Input and Upload
440
- vc_input = gr.Textbox(label="Input audio path", visible=False)
441
- vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
442
- # Youtube
443
- vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
444
- vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
445
- vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
446
- vc_split = gr.Button("Split Audio", variant="primary", visible=False)
447
- vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
448
- vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
449
- vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
450
- # TTS
451
- tts_text = gr.Textbox(visible=True, label="TTS text", info="Text to speech input (There is a limit of 250 characters)", interactive=True)
452
- tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=True, allow_custom_value=False, value="English-Ana (Female)", interactive=True)
453
- # Record Own
454
- record_button = gr.Audio(source="microphone", label="Record your own audio", visible=False, interactive=True)
455
- with gr.Column():
456
- vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
457
- f0method0 = gr.Radio(
458
- label="Pitch extraction algorithm",
459
- info=f0method_info,
460
- choices=f0method_mode,
461
- value="rmvpe",
462
- interactive=True
463
- )
464
- if model_index:
465
- index_rate1 = gr.Slider(
466
- minimum=0,
467
- maximum=1,
468
- label="Retrieval feature ratio",
469
- info="Accent control. Too high will usually sound too robotic. (Default: 0.4)",
470
- value=0.4,
471
- interactive=True,
472
- )
473
- else:
474
- index_rate1 = gr.Number(value=0.4, interactive=False, visible=False)
475
- with gr.Accordion("Advanced Options", open=False):
476
- filter_radius0 = gr.Slider(
477
- minimum=0,
478
- maximum=7,
479
- label="Apply Median Filtering",
480
- info="The value represents the filter radius and can reduce breathiness.",
481
- value=1,
482
- step=1,
483
- interactive=True,
484
- )
485
- resample_sr0 = gr.Slider(
486
- minimum=0,
487
- maximum=48000,
488
- label="Resample the output audio",
489
- info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling.",
490
- value=0,
491
- step=1,
492
- interactive=True,
493
- )
494
- rms_mix_rate0 = gr.Slider(
495
- minimum=0,
496
- maximum=1,
497
- label="Volume Envelope",
498
- info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
499
- value=1,
500
- interactive=True,
501
- )
502
- protect0 = gr.Slider(
503
- minimum=0,
504
- maximum=0.5,
505
- label="Voice Protection",
506
- info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
507
- value=0.23,
508
- step=0.01,
509
- interactive=True,
510
- )
511
  with gr.Column():
512
  vc_log = gr.Textbox(label="Output Information", interactive=False)
513
  vc_output = gr.Audio(label="Output Audio", interactive=False)
@@ -516,6 +512,30 @@ if __name__ == '__main__':
516
  vc_mp = gr.Textbox(value=model_path, visible=False, interactive=False)
517
  vc_mi = gr.Textbox(value=model_index, visible=False, interactive=False)
518
  vc_convert = gr.Button("Convert", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
  vc_volume = gr.Slider(
520
  minimum=0,
521
  maximum=10,
@@ -529,59 +549,65 @@ if __name__ == '__main__':
529
  vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
530
  vc_combine = gr.Button("Combine",variant="primary", visible=False)
531
 
532
- vc_convert.click(
533
- fn=infer,
534
- inputs=[
535
- vc_name,
536
- vc_mp,
537
- vc_mi,
538
- vc_audio_mode,
539
- vc_input,
540
- vc_upload,
541
- tts_text,
542
- tts_voice,
543
- vc_transform0,
544
- f0method0,
545
- index_rate1,
546
- filter_radius0,
547
- resample_sr0,
548
- rms_mix_rate0,
549
- protect0,
550
- record_button
551
- ],
552
- outputs=[vc_log, vc_output]
553
- )
554
- vc_split.click(
555
- fn=cut_vocal_and_inst,
556
- inputs=[vc_link, vc_download_audio, vc_split_model],
557
- outputs=[vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input]
558
- )
559
- vc_combine.click(
560
- fn=combine_vocal_and_inst,
561
- inputs=[vc_output, vc_volume, vc_split_model],
562
- outputs=[vc_combined_output]
563
- )
564
- vc_audio_mode.change(
565
- fn=change_audio_mode,
566
- inputs=[vc_audio_mode],
567
- outputs=[
568
- vc_input,
569
- vc_upload,
570
- vc_download_audio,
571
- vc_link,
572
- vc_split_model,
573
- vc_split,
574
- vc_vocal_preview,
575
- vc_inst_preview,
576
- vc_audio_preview,
577
- vc_volume,
578
- vc_combined_output,
579
- vc_combine,
580
- tts_text,
581
- tts_voice,
582
- record_button
583
- ]
584
- )
 
 
 
 
 
 
585
  gr.Markdown(
586
  "## <center>Credit to:\n"
587
  "#### <center>Original devs:\n"
 
406
  load_hubert()
407
  categories = load_model()
408
  voices = list(language_dict.keys())
409
+
410
+ #Gradio preloading
411
+ vc_audio_mode = gr.Dropdown(label="Input voice", choices=audio_mode, allow_custom_value=False, value="Edge-TTS")
412
+ # Input and Upload
413
+ vc_input = gr.Textbox(label="Input audio path", visible=False)
414
+ vc_upload = gr.Audio(label="Upload audio file", visible=False, interactive=True)
415
+ # Youtube
416
+ vc_download_audio = gr.Dropdown(label="Provider", choices=["Youtube"], allow_custom_value=False, visible=False, value="Youtube", info="Select provider (Default: Youtube)")
417
+ vc_link = gr.Textbox(label="Youtube URL", visible=False, info="Example: https://www.youtube.com/watch?v=Nc0sB1Bmf-A", placeholder="https://www.youtube.com/watch?v=...")
418
+ vc_split_model = gr.Dropdown(label="Splitter Model", choices=["htdemucs", "mdx_extra_q"], allow_custom_value=False, visible=False, value="htdemucs", info="Select the splitter model (Default: htdemucs)")
419
+ vc_split = gr.Button("Split Audio", variant="primary", visible=False)
420
+ vc_vocal_preview = gr.Audio(label="Vocal Preview", visible=False)
421
+ vc_inst_preview = gr.Audio(label="Instrumental Preview", visible=False)
422
+ vc_audio_preview = gr.Audio(label="Audio Preview", visible=False)
423
+ # TTS
424
+ tts_text = gr.Textbox(visible=True, label="TTS text", info="Text to speech input (There is a limit of 250 characters)", interactive=True)
425
+ tts_voice = gr.Dropdown(label="Edge-tts speaker", choices=voices, visible=True, allow_custom_value=False, value="English-Ana (Female)", interactive=True)
426
+ # Record Own
427
+ record_button = gr.Audio(source="microphone", label="Record your own audio", visible=False, interactive=True)
428
+ vc_transform0 = gr.Number(label="Transpose", value=0, info='Type "12" to change from male to female voice. Type "-12" to change female to male voice')
429
+ f0method0 = gr.Radio(
430
+ label="Pitch extraction algorithm",
431
+ info=f0method_info,
432
+ choices=f0method_mode,
433
+ value="pm",
434
+ interactive=True
435
+ )
436
+ index_rate1 = gr.Slider(
437
+ minimum=0,
438
+ maximum=1,
439
+ label="Retrieval feature ratio",
440
+ info="Accent control. Too high will usually sound too robotic. (Default: 0.4)",
441
+ value=0.4,
442
+ interactive=True,
443
+ )
444
+ filter_radius0 = gr.Slider(
445
+ minimum=0,
446
+ maximum=7,
447
+ label="Apply Median Filtering",
448
+ info="The value represents the filter radius and can reduce breathiness.",
449
+ value=1,
450
+ step=1,
451
+ interactive=True,
452
+ )
453
+ resample_sr0 = gr.Slider(
454
+ minimum=0,
455
+ maximum=48000,
456
+ label="Resample the output audio",
457
+ info="Resample the output audio in post-processing to the final sample rate. Set to 0 for no resampling.",
458
+ value=0,
459
+ step=1,
460
+ interactive=True,
461
+ )
462
+ rms_mix_rate0 = gr.Slider(
463
+ minimum=0,
464
+ maximum=1,
465
+ label="Volume Envelope",
466
+ info="Use the volume envelope of the input to replace or mix with the volume envelope of the output. The closer the ratio is to 1, the more the output envelope is used",
467
+ value=1,
468
+ interactive=True,
469
+ )
470
+ protect0 = gr.Slider(
471
+ minimum=0,
472
+ maximum=0.5,
473
+ label="Voice Protection",
474
+ info="Protect voiceless consonants and breath sounds to prevent artifacts such as tearing in electronic music. Set to 0.5 to disable. Decrease the value to increase protection, but it may reduce indexing accuracy",
475
+ value=0.23,
476
+ step=0.01,
477
+ interactive=True,
478
+ )
479
+
480
  with gr.Blocks(theme=gr.themes.Base()) as app:
481
  gr.Markdown(
482
  "# <center> VTuber RVC Models\n"
 
504
  (f'<img style="width:auto;height:300px;" src="file/{cover}">' if cover else "")+
505
  '</div>'
506
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  with gr.Column():
508
  vc_log = gr.Textbox(label="Output Information", interactive=False)
509
  vc_output = gr.Audio(label="Output Audio", interactive=False)
 
512
  vc_mp = gr.Textbox(value=model_path, visible=False, interactive=False)
513
  vc_mi = gr.Textbox(value=model_index, visible=False, interactive=False)
514
  vc_convert = gr.Button("Convert", variant="primary")
515
+
516
+ vc_convert.click(
517
+ fn=infer,
518
+ inputs=[
519
+ vc_name,
520
+ vc_mp,
521
+ vc_mi,
522
+ vc_audio_mode,
523
+ vc_input,
524
+ vc_upload,
525
+ tts_text,
526
+ tts_voice,
527
+ vc_transform0,
528
+ f0method0,
529
+ index_rate1,
530
+ filter_radius0,
531
+ resample_sr0,
532
+ rms_mix_rate0,
533
+ protect0,
534
+ record_button
535
+ ],
536
+ outputs=[vc_log, vc_output]
537
+ )
538
+
539
  vc_volume = gr.Slider(
540
  minimum=0,
541
  maximum=10,
 
549
  vc_combined_output = gr.Audio(label="Output Combined Audio", visible=False)
550
  vc_combine = gr.Button("Combine",variant="primary", visible=False)
551
 
552
+ with gr.Row():
553
+ with gr.Column():
554
+ vc_audio_mode.render()
555
+ vc_input.render()
556
+ vc_upload.render()
557
+ # Youtube
558
+ vc_download_audio.render()
559
+ vc_link.render()
560
+ vc_split_model.render()
561
+ vc_split.render()
562
+ vc_vocal_preview.render()
563
+ vc_inst_preview.render()
564
+ vc_audio_preview.render()
565
+ # TTS
566
+ tts_text.render()
567
+ tts_voice.render()
568
+ # Record Own
569
+ record_button.render()
570
+ with gr.Column():
571
+ vc_transform0.render()
572
+ f0method0.render()
573
+ index_rate1.render()
574
+ with gr.Accordion("Advanced Options", open=False):
575
+ filter_radius0.render()
576
+ resample_sr0.render()
577
+ rms_mix_rate0.render()
578
+ protect0.render()
579
+
580
+ vc_split.click(
581
+ fn=cut_vocal_and_inst,
582
+ inputs=[vc_link, vc_download_audio, vc_split_model],
583
+ outputs=[vc_vocal_preview, vc_inst_preview, vc_audio_preview, vc_input]
584
+ )
585
+ vc_combine.click(
586
+ fn=combine_vocal_and_inst,
587
+ inputs=[vc_output, vc_volume, vc_split_model],
588
+ outputs=[vc_combined_output]
589
+ )
590
+ vc_audio_mode.change(
591
+ fn=change_audio_mode,
592
+ inputs=[vc_audio_mode],
593
+ outputs=[
594
+ vc_input,
595
+ vc_upload,
596
+ vc_download_audio,
597
+ vc_link,
598
+ vc_split_model,
599
+ vc_split,
600
+ vc_vocal_preview,
601
+ vc_inst_preview,
602
+ vc_audio_preview,
603
+ vc_volume,
604
+ vc_combined_output,
605
+ vc_combine,
606
+ tts_text,
607
+ tts_voice,
608
+ record_button
609
+ ]
610
+ )
611
  gr.Markdown(
612
  "## <center>Credit to:\n"
613
  "#### <center>Original devs:\n"