File size: 9,620 Bytes
0768472 7e9fae4 0768472 7e9fae4 0768472 7e9fae4 0768472 7e9fae4 0768472 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 |
from authlib.integrations.flask_client import OAuth
from authlib.common.security import generate_token
import ffmpeg
from flask import Flask, render_template, request, jsonify, url_for, redirect, session
from functools import wraps
import os
import streamlink
import threading
import time
from faster_whisper import WhisperModel
import subprocess
from datetime import datetime as dt
from datetime import timedelta, timezone
from apiclient import discovery
from google.oauth2 import service_account
import json
# Secrets and deployment settings, all read from environment variables.
client_secret = os.environ.get("client_secret")  # service-account key, as JSON text
gdoc_id = os.environ.get("gdoc_id")  # ID of the Google Doc that receives the transcript
GOOGLE_CLIENT_SECRET = os.environ.get("GOOGLE_CLIENT_SECRET")
GOOGLE_CLIENT_ID = os.environ.get("GOOGLE_CLIENT_ID")
# NOTE(review): allowed_users is read here but no code in this file consults
# it, so any Google account can log in -- confirm whether that is intended.
allowed_users = os.environ.get("allowed_users")

# Faster Whisper setup
model_size = 'small'
beamsize = 2
wmodel = WhisperModel(model_size, device="cpu", compute_type="int8")

# Make sure the working folder exists and clear anything left over from a
# previous run (this also removes a stale transcript/lock file and pid file).
if not os.path.exists('transcription_files'):
    os.makedirs('transcription_files')
for f in os.listdir('transcription_files/'):
    os.remove(os.path.join('transcription_files/', f))

# The environment variable holds the key as JSON *text*. It must be parsed
# before being written out: dumping the raw string would produce a file that
# contains one quoted JSON string instead of the key object, which
# from_service_account_file() cannot load.
if not client_secret:
    raise RuntimeError("The 'client_secret' environment variable is not set.")
with open("client_secret.json", "w") as json_file:
    json.dump(json.loads(client_secret), json_file, indent=4)

scopes = ["https://www.googleapis.com/auth/documents", "https://www.googleapis.com/auth/drive.file"]
credentials = service_account.Credentials.from_service_account_file('client_secret.json', scopes=scopes)
service = discovery.build('docs', 'v1', credentials=credentials)

local_tz = 5.5  # Offset from UTC in hours, used for the transcript timestamps
# The transcript file doubles as a lock: its existence means a recording is active.
local_transcript = 'transcription_files/tr.txt'
pid_file = 'transcription_files/pid.txt'  # pid of the most recent ffmpeg process

# Check if mp3 folder exists, and create it if it doesn't
if not os.path.exists('mp3'):
    os.makedirs('mp3')
# Delete any old files in mp3 folder
for f in os.listdir('mp3/'):
    os.remove(os.path.join('mp3/', f))

app = Flask(__name__, static_url_path='/static')
# Random per process: existing session cookies stop validating after a restart.
app.secret_key = os.urandom(12)
oauth = OAuth(app)

# Shared recording state, mutated by the request handlers and the worker thread.
stream_process = None  # threading.Thread running save_audio(), or None
recording = False  # True while the worker loop should keep going
mp3_extraction_process = None  # handle of the most recent ffmpeg process, or None
def update_gdoc(text, gdoc_id):
    """Insert *text*, prefixed with a space, near the end of a Google Doc.

    The document is fetched first to find the end index of its last
    paragraph; the text is inserted one position before that index.
    Errors raised while fetching the document propagate to the caller.
    Errors raised while building or sending the insert request are printed
    and swallowed.

    Args:
        text: Text to add to the document.
        gdoc_id: ID of the target Google Doc.
    """
    print('Updating Google Doc', gdoc_id)
    document = service.documents().get(documentId=gdoc_id).execute()

    # Collect the end index of every paragraph element; the last one is used.
    paragraph_ends = []
    for element in document['body']['content']:
        if 'paragraph' in element:
            paragraph_ends.append(element['endIndex'])
    last_paragraph_end = paragraph_ends[-1]

    try:
        insert_request = {
            'insertText': {
                'location': {'index': last_paragraph_end - 1},
                'text': ' ' + text,
            }
        }
        payload = {'requests': [insert_request]}
        response = service.documents().batchUpdate(documentId=gdoc_id, body=payload).execute()
        print(response)
    except Exception as error:
        print(error)
def process_complete_callback(retcode, **kwargs):
    """Print whether an FFmpeg process finished cleanly.

    Args:
        retcode: Exit code of the process; 0 is reported as success and any
            other value as an error.
        **kwargs: Accepted and ignored.
    """
    succeeded = retcode == 0
    message = (
        "FFmpeg process completed successfully!"
        if succeeded
        else "FFmpeg process encountered an error."
    )
    print(message)
def transcribe_audio(latest_file, time_counter):
    """Transcribe one audio file and return its text with the updated counter.

    On every fifth call (when ``time_counter`` is a multiple of 5) a
    ``Time HH:MM:SS`` line is inserted after the first sentence, so that
    sentence stays attached to the previous paragraph. The time is UTC
    shifted by ``local_tz`` hours.

    Args:
        latest_file: Path of the audio file to transcribe.
        time_counter: Number of files transcribed so far, as tracked by the
            caller.

    Returns:
        A ``(transcribed, time_counter)`` tuple: the transcript text and the
        counter incremented by one.
    """
    print('transcribing ', latest_file)
    segments, _info = wmodel.transcribe(f"{latest_file}", beam_size=beamsize)  # beamsize is 2.
    text = ''.join(segment.text for segment in segments)

    # Flatten newlines, then squeeze the double spaces that can result.
    transcribed = text.replace('\n', ' ').replace('  ', ' ')

    if time_counter % 5 == 0:
        stamp = (dt.now(timezone.utc) + timedelta(hours=local_tz)).strftime('%H:%M:%S')
        # Break at the first sentence boundary. partition() keeps the
        # separator, so the first sentence's full stop is preserved (only the
        # trailing space is dropped) instead of being lost as with split().
        first_sentence, separator, remainder = transcribed.partition('. ')
        transcribed = first_sentence + separator.rstrip() + '\nTime ' + stamp + '\n' + remainder

    # Count every file, stamped or not, so the stamp recurs every fifth call.
    time_counter += 1
    return transcribed, time_counter
def save_audio(youtube_url):
    """Record a stream in 30-second MP3 chunks and transcribe them until stopped.

    Runs as the target of the worker thread started by ``start_process``.
    While the module-level ``recording`` flag is true, each iteration starts
    an ffmpeg process that writes 30 seconds of audio into a new file under
    ``mp3/`` and records its pid in ``pid_file``. Once at least two MP3 files
    exist, the oldest file other than the one just started is transcribed,
    deleted, appended to the Google Doc and to the local transcript. Each
    iteration is padded with sleep to last about 30 seconds.

    Args:
        youtube_url: URL of the stream to record.

    Returns:
        ``None`` when the loop ends because ``recording`` became false, or the
        error message as a string if an exception stopped the loop.
    """
    global recording, mp3_extraction_process
    try:
        streams = streamlink.streams(youtube_url)
        # NOTE(review): the "144p" variant is hard-coded; a stream that does
        # not offer it raises KeyError, which is handled below.
        stream_url = streams["144p"].url
        time_counter = 0
        while recording:
            # Save audio only into mp3 files
            saved_mp3 = f"mp3/audio_{int(time.time())}.mp3"
            mp3_extraction_process = (
                ffmpeg
                .input(stream_url, t=30)
                .audio
                # TODO - change destination url to relevant url
                .output(saved_mp3)
                .overwrite_output()
                .global_args('-loglevel', 'panic')
                .run_async()
            )
            print('pid', mp3_extraction_process.pid)
            # Write the pid to pid_file so stop_process can kill the process.
            with open(pid_file, 'w') as f:
                f.write(str(mp3_extraction_process.pid))

            # Only transcribe when there is a file other than the one being written.
            mp3files = [name for name in os.listdir('mp3') if name.endswith('.mp3')]
            if len(mp3files) < 2:
                print('Sleeping for 30s as only one mp3 file in folder')
                time.sleep(30)
                continue

            starttime = time.time()
            # os.listdir() returns names in arbitrary order. File names embed
            # the epoch second at which recording started, so the smallest
            # name is the oldest chunk; taking it keeps the transcript in
            # chronological order when several chunks are waiting.
            current_name = os.path.basename(saved_mp3)
            file_to_transcribe = min(name for name in mp3files if name != current_name)
            print('Working on ', file_to_transcribe)
            transcribed, time_counter = transcribe_audio(f'mp3/{file_to_transcribe}', time_counter)
            os.remove(f'mp3/{file_to_transcribe}')
            update_gdoc(transcribed, gdoc_id)
            with open(local_transcript, 'a', encoding='utf-8', errors='ignore') as f:
                f.write(transcribed)
            elapsed_time = time.time() - starttime
            print('Time to transcribe:', elapsed_time, 'seconds')
            if elapsed_time < 30:
                print(f'Sleeping for {30-elapsed_time} as there are more than one mp3 files in folder')
                time.sleep(30 - elapsed_time)
    except Exception as e:
        recording = False
        # The transcript file acts as the "recording in progress" lock. If it
        # stayed behind after a failure, start_process would refuse new
        # recordings (file exists) while stop_process would refuse to clean up
        # (recording is False), leaving the app stuck until a restart.
        try:
            os.remove(local_transcript)
        except FileNotFoundError:
            pass
        print('exception', str(e))
        return str(e)
# NOTE(review): this route is not wrapped in login_required, so it can be
# called without a logged-in session -- confirm whether that is intended.
@app.route("/start_process", methods=["POST"])
def start_process():
    """Start recording the stream whose URL is posted in the ``url`` form field.

    The local transcript file doubles as a lock: if it already exists the
    request is rejected. The lock is created only after the request has been
    validated, so a rejected request never leaves a lock behind that would
    block later requests.

    Returns:
        A JSON message with status 200 when the recording thread was started,
        or status 400 when the URL is missing or a recording is already
        running.
    """
    global recording, stream_process
    if os.path.isfile(local_transcript):
        return jsonify({"message": "Recording is already in progress."}), 400

    youtube_url = request.form.get("url")
    if not youtube_url:
        return jsonify({"message": "Please provide a valid YouTube URL."}), 400
    if recording:
        return jsonify({"message": "A recording is already in progress."}), 400

    # Create the local transcript file, which is used as a check to prevent
    # multiple recordings.
    with open(local_transcript, 'a', encoding='utf-8', errors='ignore') as f:
        f.write('')

    print('In start process')
    recording = True
    stream_process = threading.Thread(target=save_audio, args=(youtube_url,))
    stream_process.start()
    return jsonify({"message": "Recording started."}), 200
# NOTE(review): this route is not wrapped in login_required, so it can be
# called without a logged-in session -- confirm whether that is intended.
@app.route("/stop_process", methods=["POST"])
def stop_process():
    """Stop the running recording and remove its temporary files.

    Clears the ``recording`` flag, waits for the worker thread to finish its
    current iteration, terminates the last ffmpeg process, deletes the MP3
    chunks and the local transcript, and finally kills the pid stored in
    ``pid_file`` as a fallback.

    Returns:
        A JSON message with status 200 once everything is stopped, or status
        400 when no recording is in progress.
    """
    global recording, stream_process, mp3_extraction_process
    if not recording:
        return jsonify({"message": "No recording is currently in progress."}), 400
    print('In stop process')
    recording = False

    # The worker only re-checks the flag between iterations, so this can
    # block until its current sleep or transcription is over.
    if stream_process is not None:
        stream_process.join()
    stream_process = None

    # ffmpeg may never have been started, e.g. when the stop request arrives
    # while the worker is still resolving the stream URL.
    if mp3_extraction_process is not None:
        mp3_extraction_process.terminate()
    mp3_extraction_process = None

    for f in os.listdir('mp3/'):
        os.remove(os.path.join('mp3/', f))
    if os.path.isfile(local_transcript):
        os.remove(local_transcript)

    # Check if pid_file exists, read the pid inside it and kill that process.
    if os.path.isfile(pid_file):
        with open(pid_file, 'r') as f:
            pid = int(f.read())
        try:
            os.kill(pid, 9)  # For linux
        except ProcessLookupError:
            # The process has already exited; nothing left to kill.
            pass
        except Exception:
            try:
                process = subprocess.Popen(["taskkill", "/F", "/PID", str(pid)], stdout=subprocess.PIPE, stderr=subprocess.PIPE)  # For Windows
                process.communicate()
                print("Process terminated successfully.")
            except Exception as e:
                print("Error:", e)
        os.remove(pid_file)
    return jsonify({"message": "Recording stopped."}), 200
@app.route('/google/')
def google():
    """Register the Google OAuth client and redirect the browser to Google.

    A fresh nonce is stored in the session and passed along with the
    authorization request; ``google_auth`` reads it back when Google
    redirects to it.
    """
    oauth.register(
        name='google',
        client_id=GOOGLE_CLIENT_ID,
        client_secret=GOOGLE_CLIENT_SECRET,
        server_metadata_url='https://accounts.google.com/.well-known/openid-configuration',
        client_kwargs={"scope": "openid email profile"},
    )
    # Absolute URL of the google_auth view, which Google redirects back to.
    callback_url = url_for('google_auth', _external=True)
    nonce = generate_token()
    session['nonce'] = nonce
    return oauth.google.authorize_redirect(callback_url, nonce=nonce)
@app.route('/google/auth/')
def google_auth():
    """Handle the redirect back from Google and store the user in the session.

    Returns:
        A redirect to ``/home`` after a successful login, or to ``/login``
        when the session holds no nonce (the callback was opened without
        going through ``/google/`` first, or the session no longer validates).
    """
    # Without the nonce stored by google() the ID token cannot be validated.
    # Send the browser back to the login page instead of failing with a
    # KeyError (HTTP 500).
    nonce = session.get('nonce')
    if nonce is None:
        return redirect('/login')

    token = oauth.google.authorize_access_token()
    user = oauth.google.parse_id_token(token, nonce=nonce)
    # NOTE(review): every successfully authenticated Google account is
    # accepted here; the module-level allowed_users value is not checked.
    session['user'] = user
    print('USER', user)
    # Redirect to home if login successful
    return redirect('/home')
def is_not_logged_in():
    """Return True unless the session holds both a 'user' and a 'nonce' value."""
    has_user = session.get('user') is not None
    return not (has_user and session.get('nonce') is not None)
# Decorator for protected URLs: only runs the view for logged-in sessions.
def login_required(f):
    """Wrap view *f* so that sessions without a login are sent to ``/login``.

    Args:
        f: The view function to protect.

    Returns:
        A wrapper carrying the metadata of *f* that calls it with the
        original arguments when the session is logged in, and returns a
        redirect to ``/login`` otherwise.
    """
    @wraps(f)
    def guarded_view(*args, **kwargs):
        if not is_not_logged_in():
            return f(*args, **kwargs)
        return redirect('/login')

    return guarded_view
@app.route("/home")
@login_required
def home():
    """Render the home page; the login check runs before this view."""
    page = render_template("home.html")
    return page
@app.route("/", methods=["GET"])
@app.route("/login", methods=["GET"])
def login():
    """Show the login page, or redirect a logged-in session to ``/home``."""
    if is_not_logged_in():
        return render_template("login.html")
    return redirect("/home")
if __name__ == "__main__":
    # The server listens on every interface. The Werkzeug debugger lets
    # anyone who can reach it execute arbitrary code, so debug mode is
    # opt-in: set FLASK_DEBUG=1 in the environment for local development.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes", "on")
    app.run(host="0.0.0.0", debug=debug_enabled, port=8081)