File size: 1,518 Bytes
a4d117e
 
 
 
0a25040
a4d117e
 
 
b9c964e
a4d117e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44e8c42
0a25040
 
44e8c42
a4d117e
644ca35
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from typing import Dict
from transformers import pipeline
import requests
import jwt
import base64

# Not referenced anywhere in the visible code; presumably the audio sample
# rate in Hz expected by the speech model -- TODO confirm or remove.
SAMPLE_RATE = 16000

# PEM-encoded public key used by EndpointHandler.__call__ to verify the
# signature of the JWT that accompanies every request (RS512).
# NOTE(review): this value looks identical to the widely published sample key
# from the jwt.io debugger, whose private half is public knowledge -- confirm,
# and if so replace it with a deployment-specific key, since anyone could
# otherwise mint tokens this handler accepts.
PUBLIC_KEY = b"-----BEGIN PUBLIC KEY-----\nMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIBCgKCAQEAu1SU1LfVLPHCozMxH2Mo\n4lgOEePzNm0tRgeLezV6ffAt0gunVTLw7onLRnrq0/IzW7yWR7QkrmBL7jTKEn5u\n+qKhbwKfBstIs+bMY2Zkp18gnTxKLxoS2tFczGkPLPgizskuemMghRniWaoLcyeh\nkd3qqGElvW/VDL5AaWTg0nLVkjRo9z+40RQzuVaE8AkAFmxZzow3x+VJYKdjykkJ\n0iT9wCS0DRTXu269V264Vf/3jvredZiKRkgwlL9xNAwxXFg0x/XFw005UWVRIkdg\ncKWTjpBP2dPwVZ4WWC+9aGVd+Gyn1o0CLelf4rEjGoXbAAEgAqeGUxrcIlbjXfbc\nmwIDAQAB\n-----END PUBLIC KEY-----"


class EndpointHandler:
    """JWT-protected speech-recognition request handler.

    Wraps a ``transformers`` automatic-speech-recognition pipeline and only
    runs it for requests that carry a token verifiable with ``PUBLIC_KEY``.
    """

    def __init__(self, path=""):
        """Load the speech-recognition pipeline.

        Args:
            path: Accepted for interface compatibility but not used here; the
                model is always loaded by its hub id ``openai/whisper-base``.
        """
        self.pipeline = pipeline(
            "automatic-speech-recognition", model="openai/whisper-base"
        )

    def __call__(self, data: Dict[str, object]) -> Dict[str, str]:
        """Authenticate the request and transcribe the supplied audio.

        Args:
            data: Request payload. The keys read (and removed) are:
                ``token`` -- JWT verified against ``PUBLIC_KEY`` with RS512;
                ``inputs`` -- the audio, either as a base64-encoded string
                (decoded to bytes here) or as any other value, which is
                handed to the pipeline unchanged;
                ``parameters`` -- optional mapping of extra keyword arguments
                for the pipeline call; a missing key or ``None`` means none.

        Returns:
            Whatever the speech-recognition pipeline returns for the audio
            (presumably a dict holding the transcription -- see the
            transformers pipeline documentation).

        Raises:
            RuntimeError: If ``token`` or ``inputs`` is missing from ``data``.
            Exception: Errors raised by ``jwt.decode`` for a token that fails
                verification propagate unchanged.
        """
        # Authenticate before touching any other part of the payload.
        token = data.pop("token", None)
        if token is None:
            raise RuntimeError("missing token")
        decoded = jwt.decode(token, PUBLIC_KEY, algorithms=["RS512"])
        # ``jti`` is only used for logging, so a verified token that lacks the
        # claim must not fail the request with a KeyError.
        print("received input from jti=", decoded.get("jti"))

        inputs = data.pop("inputs", None)
        if inputs is None:
            # Fail with a clear message instead of feeding None to the model.
            raise RuntimeError("missing inputs")
        if isinstance(inputs, str):
            # String payloads are treated as base64-encoded audio.
            inputs = base64.b64decode(inputs)

        # An explicit null for "parameters" would otherwise crash on ``**None``.
        parameters = data.pop("parameters", None) or {}

        return self.pipeline(inputs, **parameters)