Darshan committed
Commit 1f89c40 • Parent(s): 05a7f27

use different setup

Files changed:
- Dockerfile +22 -16
- app.py +111 -34
- requirements.txt +24 -8
Dockerfile
CHANGED
@@ -1,28 +1,34 @@
-# Use a lightweight Python image
 FROM python:3.10-slim

+# Set working directory
+WORKDIR /app
+
 # Install system dependencies
 RUN apt-get update && apt-get install -y \
-    git
-
+    git \
+    build-essential \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*

-#
-
+# Copy requirements first to leverage Docker cache
+COPY requirements.txt .
+
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt

-# Clone
+# Clone and install NeMo
 RUN git clone https://github.com/AI4Bharat/NeMo.git && \
     cd NeMo && \
-
-    bash reinstall.sh
+    pip install -e .

-# Copy
-COPY . .
+# Copy application code
+COPY main.py .

-#
-RUN
+# Create directory for temporary files
+RUN mkdir -p /tmp/audio_files

-# Expose
-EXPOSE
+# Expose port
+EXPOSE 8000

-#
-CMD ["uvicorn", "
+# Command to run the application
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
app.py
CHANGED
@@ -1,53 +1,130 @@
 from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
 import nemo.collections.asr as nemo_asr
-import torch
 import shutil
 import os
+from tempfile import NamedTemporaryFile
+from typing import Dict
+from pydantic import BaseModel
 import uvicorn

-
+# Dictionary mapping language codes to model names
+LANGUAGE_MODELS = {
+    "hi": "ai4bharat/indicconformer_stt_hi_hybrid_ctc_rnnt_large",
+    "bn": "ai4bharat/indicconformer_stt_bn_hybrid_ctc_rnnt_large",
+    "ta": "ai4bharat/indicconformer_stt_ta_hybrid_ctc_rnnt_large",
+    # Add more languages and their corresponding models as needed
+}

-# Set the device (CPU or CUDA if available)
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-
-
-
-)
-model.freeze()  # Set to inference mode
-model = model.to(device)
-model.cur_decoder = "rnnt"  # Use RNNT decoder
-
-UPLOAD_FOLDER = "./uploads"
-os.makedirs(UPLOAD_FOLDER, exist_ok=True)  # Create upload folder if it doesn't exist
+class TranscriptionResponse(BaseModel):
+    text: str
+    language: str


-
-
-
-
-
-    with open(file_path, "wb") as buffer:
-        shutil.copyfileobj(file.file, buffer)
+app = FastAPI(
+    title="Indian Languages ASR API",
+    description="API for automatic speech recognition in Indian languages",
+    version="1.0.0",
+)

-
-
-
-
+# Add CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)

-
-
+# Cache for loaded models
+model_cache = {}

-    return {"transcription": transcription}

-
+def get_model(language: str):
+    """
+    Get or load the ASR model for the specified language
+    """
+    if language not in LANGUAGE_MODELS:
        raise HTTPException(
-            status_code=
+            status_code=400,
+            detail=f"Unsupported language: {language}. Supported languages are: {list(LANGUAGE_MODELS.keys())}",
        )

+    if language not in model_cache:
+        try:
+            model = nemo_asr.models.ASRModel.from_pretrained(LANGUAGE_MODELS[language])
+            model_cache[language] = model
+        except Exception as e:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error loading model for language {language}: {str(e)}",
+            )

-
-
-
+    return model_cache[language]
+
+
+@app.post("/transcribe/", response_model=TranscriptionResponse)
+async def transcribe_audio(
+    language: str,
+    file: UploadFile = File(...),
+):
+    """
+    Transcribe audio file in the specified Indian language
+
+    Parameters:
+    - language: Language code (e.g., 'hi' for Hindi, 'bn' for Bengali)
+    - file: Audio file in WAV format
+
+    Returns:
+    - Transcription text and language
+    """
+    # Validate file format
+    if not file.filename.endswith(".wav"):
+        raise HTTPException(status_code=400, detail="Only WAV files are supported")
+
+    # Get the appropriate model
+    model = get_model(language)
+
+    # Save uploaded file temporarily
+    with NamedTemporaryFile(delete=False, suffix=".wav") as temp_file:
+        try:
+            # Copy uploaded file to temporary file
+            shutil.copyfileobj(file.file, temp_file)
+            temp_file.flush()

-
+            # Perform transcription
+            transcriptions = model.transcribe([temp_file.name])
+
+            if not transcriptions or len(transcriptions) == 0:
+                raise HTTPException(status_code=500, detail="Transcription failed")
+
+            return TranscriptionResponse(text=transcriptions[0], language=language)
+
+        except Exception as e:
+            raise HTTPException(
+                status_code=500, detail=f"Error during transcription: {str(e)}"
+            )
+        finally:
+            # Clean up temporary file
+            os.unlink(temp_file.name)
+
+
+@app.get("/languages/")
+async def get_supported_languages() -> Dict[str, str]:
+    """
+    Get list of supported languages and their model names
+    """
+    return LANGUAGE_MODELS
+
+
+@app.get("/health/")
+async def health_check():
+    """
+    Health check endpoint
+    """
+    return {"status": "healthy"}
+
+
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=8000)
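For quick reference, a minimal client sketch against the endpoints defined in the new app.py, assuming the service is running locally on port 8000 (as exposed by the Dockerfile) and that the requests package is installed on the client side; the base URL and the WAV file path are placeholders, not part of this commit:

import requests  # client-side dependency, not listed in requirements.txt above

BASE_URL = "http://localhost:8000"  # assumed local deployment on the exposed port

# List supported language codes and their model names
print(requests.get(f"{BASE_URL}/languages/").json())

# Confirm the service is up
print(requests.get(f"{BASE_URL}/health/").json())

# Transcribe a WAV file in Hindi; 'language' is sent as a query parameter
with open("sample_hi.wav", "rb") as fh:  # placeholder path
    response = requests.post(
        f"{BASE_URL}/transcribe/",
        params={"language": "hi"},
        files={"file": ("sample_hi.wav", fh, "audio/wav")},
    )
print(response.json())  # e.g. {"text": "...", "language": "hi"}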
requirements.txt
CHANGED
@@ -1,9 +1,25 @@
-
-
-
-
-
+# requirements.txt
+fastapi==0.104.1
+uvicorn==0.24.0
+python-multipart==0.0.6
+pydantic==2.4.2
+torch==2.1.0
+torchaudio==2.1.0
+torchvision==0.16.0
+packaging==23.2
 huggingface_hub==0.23.2
-
-
-
+numpy>=1.20.0
+soundfile>=0.12.1
+librosa>=0.10.1
+omegaconf>=2.3.0
+hydra-core>=1.3.2
+pytorch-lightning>=2.1.0
+webdataset>=0.1.62
+transformers>=4.36.0
+sacremoses>=0.0.53
+youtokentome>=1.0.6
+numpy<1.24.0
+einops>=0.6.1
+contextlib2>=21.6.0
+inflect>=7.0.0
+typing_extensions>=4.8.0