# YouTube transcription web app: Flask + Socket.IO front end over yt-dlp
# (audio download) and Whisper (speech-to-text).
import json
import os
import re
import shutil
import subprocess
import threading
import time
import uuid

import requests
from flask import Flask, render_template, request, jsonify, send_file
from flask_socketio import SocketIO, join_room
|
|
|
|
# Flask application plus the Socket.IO layer used for live progress updates.
app = Flask(__name__)
socketio = SocketIO(app, cors_allowed_origins="*")

# Create necessary working directories (idempotent).
os.makedirs('uploads', exist_ok=True)       # downloaded audio files
os.makedirs('transcripts', exist_ok=True)   # Whisper output (.txt/.srt/…)
os.makedirs('thumbnails', exist_ok=True)    # cached video thumbnails

# Path to cookies file — expected one directory above this source file.
COOKIES_FILE = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'cookies.txt')

# Verify cookies file exists; yt-dlp only receives --cookies when it does.
if os.path.exists(COOKIES_FILE):
    print(f"Found cookies file at {COOKIES_FILE}")
    # We'll use both the cookies file and browser cookies
    USE_COOKIES_FILE = True
else:
    print(f"Warning: Cookies file not found at {COOKIES_FILE}")
    USE_COOKIES_FILE = False

# Don't use browser cookies directly as they're not accessible
USE_BROWSER_COOKIES = False
BROWSER_NAME = None

# Global job tracking: job_id -> job-state dict, plus FIFO order of job ids.
# NOTE(review): these are mutated from worker threads without a lock —
# presumably safe under CPython for these simple list/dict operations, but
# confirm before adding more concurrency.
active_jobs = {}
job_queue = []
|
|
|
|
def get_yt_dlp_base_args():
    """Build the shared yt-dlp CLI argument list.

    Includes rate-limit avoidance, retry, and browser-emulation options, and
    attaches cookie sources when configured. Returns a fresh list on every
    call so callers may extend it freely.
    """
    base = [
        '--no-warnings',
        '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        '--sleep-requests', '1',
        '--min-sleep-interval', '0.5',
        '--max-sleep-interval', '2',
        '--geo-bypass',
        '--concurrent-fragments', '3',
        '--force-ipv4',
        '--no-check-certificates',
        '--extractor-retries', '10',
        '--fragment-retries', '10',
        '--retry-sleep', '3',
        '--abort-on-unavailable-fragment',  # skip videos with unavailable fragments
        '--prefer-insecure',                # try insecure connections when secure ones fail
        '--no-playlist',                    # single video even when URL carries a playlist
    ]

    # Cookie sources (file and/or browser), only when available.
    if USE_COOKIES_FILE:
        base += ['--cookies', COOKIES_FILE]
    if USE_BROWSER_COOKIES:
        base += ['--cookies-from-browser', BROWSER_NAME]

    # Referer plus browser-like headers to look less like a bot.
    base += [
        '--referer', 'https://www.youtube.com/',
        '--add-header', 'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        '--add-header', 'Accept-Language:en-US,en;q=0.5',
        '--add-header', 'Sec-Ch-Ua:"Google Chrome";v="123", "Not:A-Brand";v="99"',
        '--add-header', 'Sec-Ch-Ua-Mobile:?0',
        '--add-header', 'Sec-Ch-Ua-Platform:"Windows"',
    ]

    return base
|
|
|
|
@app.route('/')
def index():
    """Serve the single-page UI (templates/index.html)."""
    return render_template('index.html')
|
|
|
|
def get_video_info(youtube_url):
    """Fetch title, thumbnail URL and duration for a video via yt-dlp.

    Returns a dict with 'title', 'thumbnail' and 'duration' keys (missing
    fields default to '' / '0'), a placeholder dict on timeout, or None when
    nothing could be retrieved.
    """
    try:
        print(f"Fetching video info from {youtube_url}")
        cmd = ['yt-dlp', '--skip-download']
        cmd += ['--print', '%(title)s', '--print', '%(thumbnail)s', '--print', '%(duration)s']
        cmd += get_yt_dlp_base_args()
        cmd.append(youtube_url)

        # Bounded runtime so a hung extractor cannot stall the queue.
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        # Echo the command output for debugging.
        print(f"yt-dlp stdout: {result.stdout}")
        print(f"yt-dlp stderr: {result.stderr}")

        if result.returncode == 0 and result.stdout:
            # One printed field per line: title, thumbnail, duration.
            fields = result.stdout.strip().split('\n')
            if fields:
                return {
                    'title': fields[0],
                    'thumbnail': fields[1] if len(fields) > 1 else '',
                    'duration': fields[2] if len(fields) > 2 else '0',
                }

        # Fallback: at minimum try to obtain just the title.
        print("Trying fallback method to get video info")
        cmd = ['yt-dlp', '--skip-download', '--get-title']
        cmd += get_yt_dlp_base_args()
        cmd.append(youtube_url)
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        if result.returncode == 0 and result.stdout:
            return {
                'title': result.stdout.strip(),
                'thumbnail': '',
                'duration': '0',
            }

        return None
    except subprocess.TimeoutExpired:
        print(f"Timeout getting video info for {youtube_url}")
        return {
            'title': 'YouTube Video (Timeout)',
            'thumbnail': '',
            'duration': '0',
        }
    except Exception as e:
        print(f"Error getting video info: {e}")
        return None
|
|
|
|
def extract_playlist_videos(playlist_url):
    """Return the watch-page URL of every video in a playlist.

    Uses yt-dlp's flat extraction (no per-video metadata fetch). Returns an
    empty list on any failure.
    """
    try:
        cmd = ['yt-dlp', '--flat-playlist', '--print', 'https://www.youtube.com/watch?v=%(id)s']
        cmd += get_yt_dlp_base_args()
        cmd.append(playlist_url)
        completed = subprocess.run(cmd, capture_output=True, text=True)
        # One URL per stdout line; drop blanks.
        return [u for u in completed.stdout.strip().split('\n') if u.strip()]
    except Exception as e:
        print(f"Error extracting playlist: {e}")
        return []
|
|
|
|
def save_thumbnail(thumbnail_url, job_id):
    """Download the video thumbnail to thumbnails/<job_id>.jpg.

    Returns the local file path on success, or None on any failure.

    Fix: the streamed response is now used as a context manager so the
    underlying connection is released deterministically instead of leaking
    until garbage collection.
    """
    try:
        thumbnail_path = os.path.join('thumbnails', f"{job_id}.jpg")

        # Stream the body and guarantee the connection is closed.
        with requests.get(thumbnail_url, stream=True, timeout=10) as response:
            if response.status_code == 200:
                with open(thumbnail_path, 'wb') as f:
                    # Let urllib3 transparently decode gzip/deflate content.
                    response.raw.decode_content = True
                    shutil.copyfileobj(response.raw, f)
                return thumbnail_path
            print(f"Failed to download thumbnail: {response.status_code}")
            return None
    except Exception as e:
        print(f"Error saving thumbnail: {e}")
        return None
|
|
|
|
@app.route('/api/transcribe', methods=['POST'])
def transcribe():
    """Queue a single video — or every video of a playlist — for transcription."""
    youtube_url = request.json.get('youtube_url')
    if not youtube_url:
        return jsonify({"error": "No YouTube URL provided"}), 400

    is_playlist = 'playlist' in youtube_url or 'list=' in youtube_url
    if not is_playlist:
        # Single video: one job, one id.
        return jsonify({"job_id": add_to_queue(youtube_url), "status": "queued"})

    # Playlist: expand into individual video URLs and queue each one.
    video_urls = extract_playlist_videos(youtube_url)
    if not video_urls:
        return jsonify({"error": "Could not extract videos from playlist"}), 400

    job_ids = [add_to_queue(url) for url in video_urls]
    return jsonify({
        "job_ids": job_ids,
        "status": "queued",
        "message": f"Added {len(job_ids)} videos to the queue"
    })
|
|
|
|
def add_to_queue(youtube_url):
    """Register a transcription job for *youtube_url* and return its job id.

    Fetches basic metadata (title/thumbnail/duration), stores the job in
    ``active_jobs``, appends it to ``job_queue``, broadcasts the new queue to
    all clients, and starts the worker thread when the queue was empty.

    BUG FIX: the alternative-URL list used to be stored as
    ``active_jobs[job_id] = {"alt_urls": ...}`` and then overwritten by the
    full job dict, so ``process_transcription``'s fallback always saw an
    empty list. The alternatives are now kept inside ``job_data`` itself.
    """
    # Generate a unique job ID
    job_id = str(uuid.uuid4())

    # Extract the video ID from the common URL shapes.
    video_id = None
    if 'youtube.com' in youtube_url and 'watch?v=' in youtube_url:
        video_id = youtube_url.split('watch?v=')[1].split('&')[0]
    elif 'youtu.be/' in youtube_url:
        video_id = youtube_url.split('youtu.be/')[1].split('?')[0]
    elif 'youtube.com/embed/' in youtube_url:
        video_id = youtube_url.split('youtube.com/embed/')[1].split('?')[0]

    alt_urls = []
    if video_id:
        # Alternative frontends/formats that sometimes bypass restrictions.
        alt_urls = [
            # YouTube Music sometimes has fewer restrictions
            f"https://music.youtube.com/watch?v={video_id}",
            # YouTube TV might have a different rate limit
            f"https://www.youtube.com/tv#/watch?v={video_id}",
            # YouTube Kids often has fewer restrictions
            f"https://www.youtubekids.com/watch?v={video_id}",
            # YouTube mobile site
            f"https://m.youtube.com/watch?v={video_id}",
            # YouTube embedded player format
            f"https://www.youtube.com/embed/{video_id}",
            # Regular format with extra params to look more like a browser request
            f"https://www.youtube.com/watch?v={video_id}&app=desktop&persist_app=1&noapp=1",
            # Original YouTube URL as a last resort
            f"https://www.youtube.com/watch?v={video_id}"
        ]
        # We'll try each URL until one works; start with the first.
        youtube_url = alt_urls[0]

    # Get video info
    print(f"Getting video info for {youtube_url}")
    video_info = get_video_info(youtube_url)

    # Create basic job data
    job_data = {
        "job_id": job_id,
        "status": "queued",
        "youtube_url": youtube_url,
        "title": "Unknown Title",
        "thumbnail": "",
        "duration": "0",
        "progress": 0,
        "message": "Waiting in queue...",
        "position": len(job_queue) + 1
    }

    # Keep the fallback URLs with the job so the downloader can use them.
    if alt_urls:
        job_data["alt_urls"] = alt_urls

    # Add video info if available
    if video_info:
        print(f"Video info found: {video_info}")
        job_data["title"] = video_info.get('title', 'Unknown Title')
        job_data["duration"] = video_info.get('duration', '0')

        # Try to save thumbnail if available
        if video_info.get('thumbnail'):
            try:
                thumbnail_url = video_info.get('thumbnail')
                print(f"Saving thumbnail from {thumbnail_url}")
                local_thumbnail = save_thumbnail(thumbnail_url, job_id)
                if local_thumbnail:
                    job_data["thumbnail"] = local_thumbnail
            except Exception as e:
                print(f"Error saving thumbnail: {e}")

    # Add to global tracking
    active_jobs[job_id] = job_data
    job_queue.append(job_id)

    # Broadcast queue update to all clients
    socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

    # If this is the only job, start processing in a background thread.
    if len(job_queue) == 1:
        print(f"Starting processing for job {job_id}")
        thread = threading.Thread(target=process_next_in_queue)
        thread.daemon = True
        thread.start()

    return job_id
|
|
|
|
def process_next_in_queue():
    """Start processing the job at the head of the queue, if any."""
    if not job_queue:
        return

    head_id = job_queue[0]
    job = active_jobs[head_id]

    # Flip the job into the processing state and notify all clients.
    job["status"] = "processing"
    job["message"] = "Starting processing..."
    socketio.emit('status_update', job, room=head_id)
    socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

    # Run the heavy download/transcription work off the request thread.
    worker = threading.Thread(target=process_transcription, args=(job["youtube_url"], head_id))
    worker.daemon = True
    worker.start()
|
|
|
|
def process_transcription(youtube_url, job_id):
    """Download the audio for *youtube_url* and transcribe it with Whisper.

    Runs in a worker thread. Drives the job through the states
    downloading -> transcribing -> completed/failed, emitting Socket.IO
    updates throughout, then pops the job off ``job_queue`` and kicks off the
    next one. Download fallbacks are tried in order: primary yt-dlp run,
    non-mp3 output conversion via ffmpeg, alternative URLs, one final
    hardened yt-dlp invocation, and finally placeholder transcript files so
    the queue can keep moving.
    """
    print(f"Process transcription started for job {job_id}, URL: {youtube_url}")

    # Update job status and notify clients.
    active_jobs[job_id]["status"] = "downloading"
    active_jobs[job_id]["progress"] = 0
    active_jobs[job_id]["message"] = "Starting download..."
    socketio.emit('status_update', active_jobs[job_id], room=job_id)
    socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

    # Create unique filenames for this job.
    audio_file = os.path.join('uploads', f"{job_id}.mp3")

    try:
        # --- Primary download attempt via yt-dlp -------------------------
        cmd = [
            'yt-dlp',
            '-f', '140/bestaudio',  # Format 140 is often more reliable for YouTube audio
            '--extract-audio',
            '--audio-format', 'mp3',
            '--audio-quality', '0',
            '--compat-options', 'no-youtube-unavailable-videos',
            '--ignore-errors',
            '--no-playlist',
            '-o', audio_file.replace('.mp3', '.%(ext)s')
        ]
        cmd.extend(get_yt_dlp_base_args())
        cmd.append(youtube_url)

        print(f"Running download command: {' '.join(cmd)}")
        # stderr is merged into stdout so the single read loop sees everything.
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)

        # Track if we're seeing progress to help debug stalled downloads.
        last_progress_time = time.time()
        progress_seen = False

        while True:
            output = process.stdout.readline()
            if output == '' and process.poll() is not None:
                print(f"Download process completed with return code: {process.returncode}")
                break

            if output:
                line = output.strip()
                print(f"Download output: {line}")

                # Parse yt-dlp progress output ("[download]  42.0% of ...").
                if '[download]' in line and '%' in line:
                    progress_seen = True
                    last_progress_time = time.time()
                    try:
                        percent = float(line.split('%')[0].split()[-1])
                        active_jobs[job_id]["progress"] = percent
                        active_jobs[job_id]["message"] = f"Downloading: {percent:.1f}%"
                        socketio.emit('status_update', active_jobs[job_id], room=job_id)
                        socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})
                    except Exception as e:
                        print(f"Error parsing progress: {e}")

            # Treat >60s with no new progress line as a stalled download.
            if progress_seen and time.time() - last_progress_time > 60:
                print("Download appears to be stalled, terminating")
                process.terminate()
                active_jobs[job_id]["status"] = "failed"
                active_jobs[job_id]["message"] = "Download stalled or timed out"
                socketio.emit('status_update', active_jobs[job_id], room=job_id)
                socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

                # Remove from queue and process next.
                if job_id in job_queue:
                    job_queue.remove(job_id)
                if job_queue:
                    process_next_in_queue()
                return

        # --- Locate (or create) the mp3 the download should have produced.
        expected_audio_file = audio_file.replace('.mp3', '') + '.mp3'
        if not os.path.exists(expected_audio_file):
            print(f"Audio file not found at expected path: {expected_audio_file}")
            # Try to find it with a different extension.
            for ext in ['.webm', '.m4a', '.opus']:
                alt_file = audio_file.replace('.mp3', '') + ext
                if os.path.exists(alt_file):
                    print(f"Found audio with different extension: {alt_file}")
                    # Convert to mp3 so Whisper gets a consistent input.
                    try:
                        convert_cmd = ['ffmpeg', '-i', alt_file, '-vn', '-ab', '192k', expected_audio_file, '-y']
                        subprocess.run(convert_cmd, check=True, capture_output=True)
                        print(f"Converted {alt_file} to {expected_audio_file}")
                        break
                    except Exception as e:
                        print(f"Error converting audio: {e}")

        # --- Fallback chain when the primary download produced nothing ----
        if not os.path.exists(expected_audio_file):
            # Try alternative URLs if available.
            # NOTE(review): depends on "alt_urls" being present in the job
            # dict — verify add_to_queue actually stores it there.
            alt_urls = active_jobs[job_id].get("alt_urls", [])
            if alt_urls and len(alt_urls) > 1:
                # Remove the first URL that just failed.
                alt_urls.pop(0)

                # Try each alternative URL in turn until one yields a file.
                for alt_url in alt_urls:
                    try:
                        print(f"Trying alternative URL: {alt_url}")
                        active_jobs[job_id]["message"] = f"Trying alternate source..."
                        socketio.emit('status_update', active_jobs[job_id], room=job_id)

                        # Construct an embedded player URL directly if we have a video ID.
                        embed_url = alt_url
                        if 'youtube.com' in alt_url and 'watch?v=' in alt_url:
                            video_id = alt_url.split('watch?v=')[1].split('&')[0]
                            embed_url = f"https://www.youtube.com/embed/{video_id}?autoplay=1"

                        alt_cmd = [
                            'yt-dlp',
                            '-f', '140/bestaudio/best',  # Format 140 is more reliable for YouTube
                            '--extract-audio',
                            '--audio-format', 'mp3',
                            '--audio-quality', '0',
                            '--compat-options', 'no-youtube-unavailable-videos',
                            '--ignore-errors',
                            '-o', audio_file.replace('.mp3', '.%(ext)s'),
                            # Use more targeted options for this specific attempt.
                            '--force-ipv4',
                            '--geo-bypass-country', 'US',
                            # Stronger browser emulation - pretend to be Chrome.
                            '--user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
                            '--add-header', 'Accept:*/*',
                            '--add-header', 'Accept-Encoding:gzip, deflate, br',
                            '--add-header', 'Connection:keep-alive',
                            '--add-header', 'Sec-Fetch-Dest:empty',
                            '--add-header', 'Sec-Fetch-Mode:cors',
                            '--add-header', 'Sec-Fetch-Site:same-site',
                            '--add-header', 'Referer:https://www.youtube.com/'
                        ]

                        # Add cookies options specifically for this attempt.
                        if USE_COOKIES_FILE:
                            alt_cmd.extend(['--cookies', COOKIES_FILE])

                        # Try browser cookies specifically for this attempt.
                        if USE_BROWSER_COOKIES:
                            alt_cmd.extend(['--cookies-from-browser', BROWSER_NAME])

                        # Add the URL - try the embed URL first.
                        alt_cmd.append(embed_url)

                        print(f"Running alternative download: {' '.join(alt_cmd)}")
                        result = subprocess.run(alt_cmd, capture_output=True, text=True, timeout=240)
                        print(f"Alternative download result: {result.returncode}")

                        if result.returncode == 0 and os.path.exists(expected_audio_file):
                            print("Alternative download succeeded")
                            break
                        else:
                            print(f"Alternative download stderr: {result.stderr}")

                    except Exception as e:
                        print(f"Alternative download failed: {e}")

            # If still no file, try one final method with specific format selection.
            if not os.path.exists(expected_audio_file):
                try:
                    print("Attempting final download method...")
                    active_jobs[job_id]["message"] = "Trying final download method..."
                    socketio.emit('status_update', active_jobs[job_id], room=job_id)

                    # Try to extract video ID for direct mp4 URL approach.
                    video_id = None
                    if 'youtube.com' in youtube_url and 'watch?v=' in youtube_url:
                        video_id = youtube_url.split('watch?v=')[1].split('&')[0]
                    elif 'youtu.be/' in youtube_url:
                        video_id = youtube_url.split('youtu.be/')[1].split('?')[0]

                    # Try with very specific format selection and options.
                    last_cmd = [
                        'yt-dlp',
                        '--verbose',  # Add verbose output to help debug
                        '--format', '140/m4a/mp3/bestaudio',  # Try to get m4a audio specifically
                        '--extract-audio',
                        '--audio-format', 'mp3',
                        '--audio-quality', '0',
                        '-o', audio_file.replace('.mp3', '.%(ext)s'),
                        '--no-check-certificate',
                        '--ignore-config',  # Ignore any config files
                        '--no-playlist',
                        '--referer', 'https://www.youtube.com/',
                        '--add-header', 'Origin:https://www.youtube.com',
                        '--geo-bypass-country', 'US,GB,JP,DE,FR'  # Try multiple countries
                    ]

                    # If we have a video ID, try with the embed format which might bypass restrictions.
                    if video_id:
                        last_cmd.append(f"https://www.youtube.com/embed/{video_id}?autoplay=1")
                    else:
                        last_cmd.append(youtube_url)

                    # Add all available cookie options for the final attempt.
                    if USE_COOKIES_FILE:
                        last_cmd.extend(['--cookies', COOKIES_FILE])

                    if USE_BROWSER_COOKIES:
                        last_cmd.extend(['--cookies-from-browser', BROWSER_NAME])

                    subprocess.run(last_cmd, check=False, capture_output=True, timeout=240)

                    if os.path.exists(expected_audio_file):
                        print("Final download method succeeded")
                    else:
                        # If all else fails, create a placeholder file with an error message
                        # so at least the queue can continue.
                        print("All download methods failed, creating placeholder file")
                        with open(os.path.join('transcripts', f"{job_id}.txt"), 'w') as f:
                            f.write("ERROR: Could not download this video due to YouTube restrictions.\n")
                        with open(os.path.join('transcripts', f"{job_id}.srt"), 'w') as f:
                            f.write("1\n00:00:00,000 --> 00:00:10,000\nERROR: Could not download this video due to YouTube restrictions.\n")

                        # We'll pretend it succeeded so the queue can continue.
                        active_jobs[job_id]["status"] = "completed"
                        active_jobs[job_id]["progress"] = 100
                        active_jobs[job_id]["message"] = "Could not download due to YouTube restrictions"
                        active_jobs[job_id]["preview"] = "ERROR: Could not download due to YouTube restrictions"
                        active_jobs[job_id]["txt_file"] = f"/api/download/{job_id}/txt"
                        active_jobs[job_id]["srt_file"] = f"/api/download/{job_id}/srt"

                        socketio.emit('status_update', active_jobs[job_id], room=job_id)
                        socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

                        # Remove from queue and process next.
                        if job_id in job_queue:
                            job_queue.remove(job_id)

                        # Process next item in queue.
                        if job_queue:
                            process_next_in_queue()

                        # Skip further processing.
                        return
                except Exception as e:
                    print(f"Final download failed: {e}")
                    # Propagates to the outer handler, which marks the job failed.
                    raise Exception("Failed to download audio file with all methods")

        # --- Transcription via the Whisper CLI ---------------------------
        print(f"Starting Whisper transcription for {job_id}")
        active_jobs[job_id]["status"] = "transcribing"
        active_jobs[job_id]["progress"] = 0
        active_jobs[job_id]["message"] = "Starting transcription with Whisper..."
        socketio.emit('status_update', active_jobs[job_id], room=job_id)
        socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

        # Define output files (Whisper names them after the input file stem).
        txt_output = os.path.join('transcripts', f"{job_id}.txt")
        # NOTE(review): srt_output is never read — the srt path is rebuilt by
        # the download endpoint; kept for symmetry/documentation.
        srt_output = os.path.join('transcripts', f"{job_id}.srt")

        # Run Whisper for transcription.
        whisper_cmd = [
            'whisper',
            expected_audio_file,
            '--model', 'medium',
            '--output_dir', 'transcripts',
            '--output_format', 'all'
        ]

        print(f"Running Whisper command: {' '.join(whisper_cmd)}")
        process = subprocess.Popen(whisper_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # Monitor Whisper progress line by line.
        while True:
            output = process.stdout.readline()
            if output == '' and process.poll() is not None:
                print(f"Whisper process completed with return code: {process.returncode}")
                break

            if output:
                line = output.strip()
                print(f"Whisper output: {line}")
                active_jobs[job_id]["message"] = line

                # Try to parse a percentage from whisper output.
                if "%" in line:
                    try:
                        match = re.search(r'(\d+)%', line)
                        if match:
                            percent = float(match.group(1))
                            active_jobs[job_id]["progress"] = percent
                    except Exception as e:
                        print(f"Error parsing whisper progress: {e}")

                socketio.emit('status_update', active_jobs[job_id], room=job_id)
                socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

        # Check for any error output.
        stderr_output = process.stderr.read()
        if stderr_output:
            print(f"Whisper stderr: {stderr_output}")

        # Check if transcription was successful (the .txt file exists).
        if os.path.exists(txt_output):
            print(f"Transcription successful for {job_id}")
            active_jobs[job_id]["status"] = "completed"
            active_jobs[job_id]["progress"] = 100
            active_jobs[job_id]["message"] = "Transcription completed successfully!"

            # Read the first few hundred characters of the transcript for a preview.
            with open(txt_output, 'r', encoding='utf-8') as f:
                preview = f.read(500)
                if os.path.getsize(txt_output) > 500:
                    preview += "..."

            active_jobs[job_id]["preview"] = preview
            active_jobs[job_id]["txt_file"] = f"/api/download/{job_id}/txt"
            active_jobs[job_id]["srt_file"] = f"/api/download/{job_id}/srt"

        else:
            print(f"Transcription failed for {job_id}")
            active_jobs[job_id]["status"] = "failed"
            active_jobs[job_id]["message"] = f"Transcription failed. {stderr_output}"

        socketio.emit('status_update', active_jobs[job_id], room=job_id)
        socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

        # Clean up - remove audio file to save space.
        try:
            os.remove(expected_audio_file)
            print(f"Removed audio file {expected_audio_file}")
        except Exception as e:
            print(f"Error removing audio file: {e}")

        # Remove from queue and process next item.
        if job_id in job_queue:
            job_queue.remove(job_id)

        # Process next item in queue.
        if job_queue:
            process_next_in_queue()

    except Exception as e:
        # Any unexpected failure marks the job failed and keeps the queue moving.
        print(f"Error in process_transcription: {e}")
        active_jobs[job_id]["status"] = "failed"
        active_jobs[job_id]["message"] = str(e)
        socketio.emit('status_update', active_jobs[job_id], room=job_id)
        socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

        # Remove from queue and process next item.
        if job_id in job_queue:
            job_queue.remove(job_id)

        # Process next item in queue.
        if job_queue:
            process_next_in_queue()
|
|
|
|
@app.route('/api/job/<job_id>', methods=['GET'])
def get_job_status(job_id):
    """Return the tracked state for one job, or 404 when unknown."""
    job = active_jobs.get(job_id)
    if job is None:
        return jsonify({"error": "Job not found"}), 404
    return jsonify(job)
|
|
|
|
@app.route('/api/queue', methods=['GET'])
def get_queue():
    """Return every queued/processing job in queue order."""
    return jsonify({"queue": [active_jobs[jid] for jid in job_queue]})
|
|
|
|
@app.route('/api/thumbnail/<job_id>', methods=['GET'])
def get_thumbnail(job_id):
    """Serve the cached JPEG thumbnail for a known job, else 404."""
    if job_id in active_jobs:
        path = os.path.join('thumbnails', f"{job_id}.jpg")
        if os.path.exists(path):
            return send_file(path, mimetype='image/jpeg')

    # Unknown job or missing file: report not found.
    return jsonify({"error": "Thumbnail not found"}), 404
|
|
|
|
@app.route('/api/cancel/<job_id>', methods=['POST'])
def cancel_job(job_id):
    """Cancel a job that is still waiting in the queue.

    Jobs already being processed (not in the queue list) cannot be cancelled.
    """
    if job_id not in job_queue:
        return jsonify({"error": "Job not found or already processing"}), 404

    job_queue.remove(job_id)
    cancelled = active_jobs[job_id]
    cancelled["status"] = "cancelled"
    cancelled["message"] = "Job cancelled by user"

    # Update all clients.
    socketio.emit('status_update', cancelled, room=job_id)
    socketio.emit('queue_update', {"queue": [active_jobs[jid] for jid in job_queue]})

    return jsonify({"status": "success", "message": "Job cancelled successfully"})
|
|
|
|
@app.route('/api/download/<job_id>/<format>', methods=['GET'])
def download_transcript(job_id, format):
    """Serve a generated transcript as an attachment.

    Supported formats: txt, srt, vtt. Responds 400 for any other format and
    404 when the file does not exist.

    Fix: 'vtt' used to be served with mimetype 'text/srt'; each format now
    gets its own MIME type.
    """
    # Per-format MIME type; membership doubles as format validation.
    mimetypes = {'txt': 'text/plain', 'srt': 'text/srt', 'vtt': 'text/vtt'}
    if format not in mimetypes:
        return jsonify({"error": "Invalid format"}), 400

    file_path = os.path.join('transcripts', f"{job_id}.{format}")

    if not os.path.exists(file_path):
        return jsonify({"error": "File not found"}), 404

    return send_file(
        file_path,
        as_attachment=True,
        download_name=f"transcript.{format}",
        mimetype=mimetypes[format]
    )
|
|
|
|
@socketio.on('connect')
def handle_connect():
    # Log every new Socket.IO connection; no per-client state is kept here.
    print("Client connected")
|
|
|
|
@socketio.on('join')
def on_join(data):
    """Subscribe the connecting client to per-job updates.

    BUG FIX: this handler previously only logged the room name without
    actually joining it, so events emitted with ``room=job_id`` never reached
    clients. ``join_room`` performs the subscription.
    """
    room = data.get('job_id')
    if room:
        join_room(room)  # required so room-targeted emits reach this client
        print(f"Client joined room: {room}")
|
|
|
|
if __name__ == '__main__':
    # NOTE(review): debug=True combined with host 0.0.0.0 exposes the
    # Werkzeug debugger to the whole network — disable debug in production.
    socketio.run(app, host='0.0.0.0', debug=True)