Spaces:
Running
Running
{ | |
"models": [ | |
// Configuration for the built-in models. You can remove any of these | |
// if you don't want to use the default models. | |
{ | |
"name": "tiny", | |
"url": "tiny" | |
}, | |
{ | |
"name": "base", | |
"url": "base" | |
}, | |
{ | |
"name": "small", | |
"url": "small" | |
}, | |
{ | |
"name": "medium", | |
"url": "medium" | |
}, | |
{ | |
"name": "large", | |
"url": "large" | |
}, | |
{ | |
"name": "large-v2", | |
"url": "large-v2" | |
}, | |
// Uncomment to add custom Japanese models | |
// NOTE: For Faster-Whisper, the models must be converted to the CTranslate2 format, | |
// see https://github.com/guillaumekln/faster-whisper#model-conversion | |
//{ | |
// "name": "whisper-large-v2-mix-jp", | |
// "url": "arc-r/faster-whisper-large-v2-mix-jp", | |
// // The type of the model. Can be "huggingface" or "whisper" - "whisper" is the default. | |
// // HuggingFace models are loaded using the HuggingFace transformers library and then converted to Whisper models. | |
// "type": "huggingface", | |
//}, | |
//{ | |
// "name": "local-model", | |
// "url": "path/to/local/model", | |
//}, | |
//{ | |
// "name": "remote-model", | |
// "url": "https://example.com/path/to/model", | |
//} | |
], | |
// Configuration options that will be used if they are not specified in the command line arguments. | |
// * WEBUI options * | |
// Maximum audio file length in seconds, or -1 for no limit. Ignored by CLI. | |
"input_audio_max_duration": 1800, | |
// True to share the app on HuggingFace. | |
"share": false, | |
// The host or IP to bind to. If None, bind to localhost. | |
"server_name": null, | |
// The port to bind to. | |
"server_port": 7860, | |
// The number of workers to use for the web server. Use -1 to disable queueing. | |
"queue_concurrency_count": 1, | |
// Whether or not to automatically delete all uploaded files, to save disk space | |
"delete_uploaded_files": true, | |
// * General options * | |
// The default implementation to use for Whisper. Can be "whisper" or "faster-whisper". | |
// Note that you must either install the requirements for faster-whisper (requirements-fasterWhisper.txt) | |
// or whisper (requirements.txt) | |
"whisper_implementation": "faster-whisper", | |
// The default model name. | |
"default_model_name": "medium", | |
// The default VAD. | |
"default_vad": "silero-vad", | |
// A comma-delimited list of CUDA devices to use for parallel processing. If None, disable parallel processing. | |
"vad_parallel_devices": "", | |
// The number of CPU cores to use for VAD pre-processing. | |
"vad_cpu_cores": 1, | |
// The number of seconds before inactive processes are terminated. Use 0 to close processes immediately, or None for no timeout. | |
"vad_process_timeout": 1800, | |
// True to use all available GPUs and CPU cores for processing. Use vad_cpu_cores/vad_parallel_devices to specify the number of CPU cores/GPUs to use. | |
"auto_parallel": false, | |
// Directory to save the outputs (CLI will use the current directory if not specified) | |
"output_dir": null, | |
// The path to save model files; uses ~/.cache/whisper by default | |
"model_dir": null, | |
// Device to use for PyTorch inference, or null to use the default device | |
"device": null, | |
// Whether to print out the progress and debug messages | |
"verbose": true, | |
// Whether to perform X->X speech recognition ('transcribe') or X->English translation ('translate') | |
"task": "transcribe", | |
// Language spoken in the audio, specify None to perform language detection | |
"language": null, | |
// The window size (in seconds) to merge voice segments | |
"vad_merge_window": 5, | |
// The maximum size (in seconds) of a voice segment | |
"vad_max_merge_size": 30, | |
// The padding (in seconds) to add to each voice segment | |
"vad_padding": 1, | |
// Whether to prepend the initial prompt to each VAD segment (prepend_all_segments), or just the first segment (prepend_first_segment) | |
"vad_initial_prompt_mode": "prepend_first_segment", | |
// The window size of the prompt to pass to Whisper | |
"vad_prompt_window": 3, | |
// Temperature to use for sampling | |
"temperature": 0, | |
// Number of candidates when sampling with non-zero temperature | |
"best_of": 5, | |
// Number of beams in beam search, only applicable when temperature is zero | |
"beam_size": 5, | |
// Optional patience value to use in beam decoding, as in https://arxiv.org/abs/2204.05424, the default (1.0) is equivalent to conventional beam search | |
"patience": 1, | |
// Optional token length penalty coefficient (alpha) as in https://arxiv.org/abs/1609.08144, uses simple length normalization by default | |
"length_penalty": null, | |
// Comma-separated list of token ids to suppress during sampling; '-1' will suppress most special characters except common punctuations | |
"suppress_tokens": "-1", | |
// Optional text to provide as a prompt for the first window | |
"initial_prompt": null, | |
// If True, provide the previous output of the model as a prompt for the next window; disabling may make the text inconsistent across windows, but the model becomes less prone to getting stuck in a failure loop | |
"condition_on_previous_text": true, | |
// Whether to perform inference in fp16; True by default | |
"fp16": true, | |
// The compute type used by faster-whisper. Can be "auto", "int8", "int16" or "float16". | |
"compute_type": "auto", | |
// Temperature to increase when falling back when the decoding fails to meet either of the thresholds below | |
"temperature_increment_on_fallback": 0.2, | |
// If the gzip compression ratio is higher than this value, treat the decoding as failed | |
"compression_ratio_threshold": 2.4, | |
// If the average log probability is lower than this value, treat the decoding as failed | |
"logprob_threshold": -1.0, | |
// If the probability of the <no-speech> token is higher than this value AND the decoding has failed due to `logprob_threshold`, consider the segment as silence | |
"no_speech_threshold": 0.6, | |
// (experimental) extract word-level timestamps and refine the results based on them | |
"word_timestamps": false, | |
// if word_timestamps is True, merge these punctuation symbols with the next word | |
"prepend_punctuations": "\"\'“¿([{-", | |
// if word_timestamps is True, merge these punctuation symbols with the previous word | |
"append_punctuations": "\"\'.。,,!!??::”)]}、", | |
// (requires --word_timestamps True) underline each word as it is spoken in srt and vtt | |
"highlight_words": false, | |
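// * Diarization options * | |
// Authentication token used for speaker diarization (presumably a HuggingFace access token - confirm), or null if none is needed | |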
"auth_token": null, | |
// Whether to perform speaker diarization | |
"diarization": false, | |
// The number of speakers to detect | |
"diarization_speakers": 2, | |
// The minimum number of speakers to detect | |
"diarization_min_speakers": 1, | |
// The maximum number of speakers to detect | |
"diarization_max_speakers": 5, | |
} |