# app.py -- "Create app.py" (commit 0b7d904, verified), uploaded by bilal6913.
import os
import uuid

import torch
import torchaudio
from flask import Flask, request, jsonify, render_template
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# Initialize Flask app
app = Flask(__name__)

# Load the model and processor once at import time; the request handlers
# below reuse these module-level objects instead of reloading per request.
model_name = "jonatasgrosman/wav2vec2-large-xlsr-53-arabic"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)

# Define the upload folder
UPLOAD_FOLDER = 'uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Ensure the upload folder exists. exist_ok=True replaces the separate
# os.path.exists() check, which could race with another process creating
# the directory between the check and the makedirs() call.
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page
def _new_upload_path(client_filename):
    """Return a server-generated path in the upload folder for one upload.

    Only the extension of *client_filename* is kept (and only when it is
    purely alphanumeric); the rest of the name is a random UUID. The
    client-supplied name is never used as a path component, so values such
    as ``../../etc/passwd`` cannot escape the upload folder, and two
    concurrent uploads with the same name cannot overwrite each other.
    """
    extension = os.path.splitext(os.path.basename(client_filename or ''))[1]
    if not extension[1:].isalnum():
        extension = ''
    return os.path.join(app.config['UPLOAD_FOLDER'], uuid.uuid4().hex + extension)


def _decode_upload(upload):
    """Decode an uploaded audio file into a 1-D waveform sampled at 16 kHz.

    The upload is written to disk so torchaudio can read it, and the
    on-disk copy is removed again whether or not decoding succeeds.
    """
    file_path = _new_upload_path(upload.filename)
    try:
        upload.save(file_path)
        speech_array, sampling_rate = torchaudio.load(file_path)
    finally:
        # The file is only needed while decoding; without this the upload
        # folder would grow by one file per request.
        if os.path.exists(file_path):
            os.remove(file_path)

    # torchaudio.load yields (channels, frames) by default. Average the
    # channels so stereo input becomes a single utterance; squeeze() alone
    # would leave a (2, frames) array for the processor.
    speech_array = speech_array.mean(dim=0)
    resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
    return resampler(speech_array)


@app.route('/transcribe', methods=['POST'])
def transcribe_audio():
    """Transcribe the audio file uploaded in the ``file`` form field.

    Returns:
        A JSON response ``{'transcription': <text>}`` on success, or
        ``{'error': <message>}`` with status 400 when the ``file`` part is
        missing, has an empty filename, or cannot be decoded as audio.
    """
    if 'file' not in request.files:
        return jsonify({'error': 'No file part'}), 400

    upload = request.files['file']
    if upload.filename == '':
        return jsonify({'error': 'No selected file'}), 400
    if not upload:
        return jsonify({'error': 'Something went wrong!'}), 500

    try:
        speech_array = _decode_upload(upload)
    except (RuntimeError, ValueError):
        # Undecodable input is a client error, not a server fault; errors
        # while saving the file (OSError) still propagate as a 500.
        app.logger.exception('Could not decode uploaded audio')
        return jsonify({'error': 'Could not read audio file'}), 400

    # Process the audio input
    input_values = processor(speech_array.numpy(), return_tensors="pt", sampling_rate=16000).input_values

    # Perform inference without tracking gradients
    with torch.no_grad():
        logits = model(input_values).logits

    # Take the most likely token per frame and decode to text
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    return jsonify({'transcription': transcription[0]})
if __name__ == '__main__':
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary code, so debug mode is opt-in through the FLASK_DEBUG
    # environment variable rather than always on.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in {'1', 'true', 'yes'}
    app.run(debug=debug_enabled)