import asyncio
import base64
import io
import logging
import wave
from typing import Optional

import requests
from elevenlabs.client import ElevenLabs
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from google import genai
from google.genai import types
from PIL import Image
from pydantic import BaseModel
from pydub import AudioSegment

from prompts import (ACCESSIBILITY_PROMPT, NARRATION_PROMPT,
                     NARRATION_SYSTEM_PROMPT)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Accessibility Service API",
    description="API for generating audio narrations and making images accessible",
    version="1.0.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


class ProcessImageRequest(BaseModel):
    imageUrl: str
    googleApiKey: str
    elevenlabsApiKey: Optional[str] = None


class ProcessImageUploadRequest(BaseModel):
    googleApiKey: str
    elevenlabsApiKey: Optional[str] = None


class ProcessImageResponse(BaseModel):
    accessibleImage: str
    description: str
    narrationURL: Optional[str] = None


def get_google_client(api_key: str) -> genai.Client:
    """Create and return a Google Genai client with the provided API key"""
    try:
        return genai.Client(api_key=api_key)
    except Exception as e:
        raise HTTPException(
            status_code=400, detail=f"Failed to initialize Google client: {str(e)}"
        )


def get_elevenlabs_client(api_key: str) -> ElevenLabs:
    """Create and return an ElevenLabs client with the provided API key"""
    try:
        return ElevenLabs(api_key=api_key)
    except Exception as e:
        raise HTTPException(
            status_code=400, detail=f"Failed to initialize ElevenLabs client: {str(e)}"
        )


def download_image(url: str) -> types.Part:
    """Download an image from a URL and return it as a Gemini content Part."""
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Use the served content type when available; fall back to JPEG.
        mime_type = response.headers.get("Content-Type", "image/jpeg").split(";")[0]
        return types.Part.from_bytes(data=response.content, mime_type=mime_type)
    except Exception as e:
        raise HTTPException(
            status_code=400, detail=f"Failed to download image: {str(e)}"
        )


def image_to_base64(image: Image.Image) -> str:
    """Convert PIL Image to base64 data URL"""
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    img_bytes = buffer.getvalue()
    img_base64 = base64.b64encode(img_bytes).decode()
    return f"data:image/png;base64,{img_base64}"


async def generate_description(image: types.Part, google_client: genai.Client) -> str:
    """Generate a text description of the image using a generative AI model."""
    try:
        # Run the blocking SDK call in a worker thread so it can overlap with the
        # accessible-image generation gathered alongside it.
        response = await asyncio.to_thread(
            google_client.models.generate_content,
            model="gemini-2.5-flash",
            contents=[image, NARRATION_PROMPT],
            config=types.GenerateContentConfig(
                system_instruction=NARRATION_SYSTEM_PROMPT, temperature=0.1
            ),
        )
        return response.text
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to generate description: {str(e)}"
        )


def audio_to_base64(audio_bytes: bytes, mime: str = "audio/mpeg") -> str:
    """Convert audio bytes to base64 data URL"""
    audio_base64 = base64.b64encode(audio_bytes).decode()
    return f"data:{mime};base64,{audio_base64}"


async def generate_narration_audio(text: str, elevenlabs_client: ElevenLabs) -> str:
    """Generate audio from ElevenLabs (MP3) and return base64 data URL."""
    try:
        audio = elevenlabs_client.text_to_speech.convert(
            text=text,
            voice_id="XfNU2rGpBa01ckF309OY",
            model_id="eleven_multilingual_v2",
            output_format="mp3_44100_128",
            apply_text_normalization="on",
        )
        # The SDK yields the MP3 as a stream of chunks; join them into one payload.
        audio_bytes = b"".join(audio)
        return audio_to_base64(audio_bytes, mime="audio/mpeg")
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to generate audio: {str(e)}"
        )


async def generate_narration_audio_gemini(
    description: str, google_client: genai.Client
) -> str:
    """Generate audio from Gemini (PCM → WAV → MP3) and return base64 data URL."""
    try:
        response = google_client.models.generate_content(
            model="gemini-2.5-flash-preview-tts",
            contents=description,
            config=types.GenerateContentConfig(
                response_modalities=["AUDIO"],
                speech_config=types.SpeechConfig(
                    voice_config=types.VoiceConfig(
                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
                            voice_name="Leda",
                        )
                    )
                ),
            ),
        )
        audio_data = response.candidates[0].content.parts[0].inline_data.data
        if isinstance(audio_data, str):
            audio_data = base64.b64decode(audio_data)
        # The TTS output is raw PCM; wrap it in a WAV container so pydub can
        # transcode it to MP3.
        wav_buffer = io.BytesIO()
        with wave.open(wav_buffer, "wb") as wf:
            wf.setnchannels(1)  # mono
            wf.setsampwidth(2)  # 16-bit PCM
            wf.setframerate(24000)  # 24 kHz
            wf.writeframes(audio_data)
        wav_buffer.seek(0)
        audio_segment = AudioSegment.from_wav(wav_buffer)
        mp3_buffer = io.BytesIO()
        audio_segment.export(mp3_buffer, format="mp3")
        mp3_bytes = mp3_buffer.getvalue()
        return audio_to_base64(mp3_bytes, mime="audio/mpeg")
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to generate audio (Gemini): {str(e)}"
        )


async def generate_accessible_image(
    original_image: types.Part, google_client: genai.Client
) -> str:
    """
    Generate an accessible version of the image using AI image generation.
    """
    try:
        # Run the blocking SDK call in a worker thread so it can overlap with the
        # description generation gathered alongside it.
        response = await asyncio.to_thread(
            google_client.models.generate_content,
            model="gemini-2.5-flash-image-preview",
            contents=[ACCESSIBILITY_PROMPT, original_image],
        )
        first_image = None
        for part in response.candidates[0].content.parts:
            if part.inline_data is not None:
                first_image = Image.open(io.BytesIO(part.inline_data.data))
                break  # keep only the first image part
        if not first_image:
            raise HTTPException(
                status_code=500, detail="Failed to generate accessible image"
            )
        return image_to_base64(first_image)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Failed to generate accessible image: {str(e)}"
        )


async def _process(
    google_api_key: str, elevenlabs_api_key: Optional[str], original_image: types.Part
) -> ProcessImageResponse:
    google_client = get_google_client(google_api_key)

    # Description and accessible image are independent, so generate them concurrently.
    description, accessible_image_base64 = await asyncio.gather(
        generate_description(original_image, google_client),
        generate_accessible_image(original_image, google_client),
    )
    logger.info("Generated image and description")

    # Prefer ElevenLabs for narration when an API key is provided; otherwise fall
    # back to Gemini TTS.
    narration_url = None
    if elevenlabs_api_key:
        try:
            elevenlabs_client = get_elevenlabs_client(elevenlabs_api_key)
            narration_url = await generate_narration_audio(
                description, elevenlabs_client
            )
        except Exception as e:
            logger.warning(f"Failed to generate audio narration: {str(e)}")
    else:
        narration_url = await generate_narration_audio_gemini(
            description, google_client
        )
    logger.info("Generated narration audio")

    return ProcessImageResponse(
        accessibleImage=accessible_image_base64,
        description=description,
        narrationURL=narration_url,
    )


@app.post("/process-image")  # route path assumed
async def process_image(request: ProcessImageRequest):
    """
    Process an image URL to generate accessible version, description, and narration.

    The description and accessible-image generation run in parallel.

    Args:
        request: JSON payload containing imageUrl, googleApiKey, and optional elevenlabsApiKey

    Returns:
        JSON response with accessibleImage, description, and optional narrationURL
    """
    logger.info(f"Processing image from URL: {request.imageUrl}")
    try:
        image = await asyncio.to_thread(download_image, request.imageUrl)
        return await _process(request.googleApiKey, request.elevenlabsApiKey, image)
    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
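

# Illustrative only: one way a client could call the URL-based endpoint, assuming
# the /process-image route above, a local server on port 8000, and placeholder API
# keys. Not executed by the service.
#
#   import requests
#
#   resp = requests.post(
#       "http://localhost:8000/process-image",
#       json={
#           "imageUrl": "https://example.com/photo.jpg",
#           "googleApiKey": "YOUR_GOOGLE_API_KEY",
#           "elevenlabsApiKey": "YOUR_ELEVENLABS_API_KEY",  # optional
#       },
#   )
#   resp.raise_for_status()
#   print(resp.json()["description"])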


@app.post("/process-image/upload")  # route path assumed
async def process_image_upload(
    image: UploadFile = File(...),
    googleApiKey: Optional[str] = None,
    elevenlabsApiKey: Optional[str] = None,
):
    """
    Process an uploaded image file to generate accessible version, description, and narration.

    Args:
        image: Uploaded image file
        googleApiKey: Google API key (required)
        elevenlabsApiKey: ElevenLabs API key (optional)

    Returns:
        JSON response with accessibleImage, description, and optional narrationURL
    """
    try:
        if not googleApiKey:
            raise HTTPException(status_code=400, detail="googleApiKey is required")
        if not (image.content_type or "").startswith("image/"):
            raise HTTPException(status_code=400, detail="File must be an image")

        image_data = await image.read()
        try:
            pil_image = Image.open(io.BytesIO(image_data))
            pil_image.verify()  # Verify it's a valid image
        except Exception as e:
            raise HTTPException(status_code=400, detail=f"Invalid image file: {str(e)}")

        # verify() invalidates the image object, so reopen it before re-encoding to PNG.
        pil_image = Image.open(io.BytesIO(image_data))
        buffer = io.BytesIO()
        pil_image.save(buffer, format="PNG")
        buffer.seek(0)
        original_image = types.Part.from_bytes(
            data=buffer.getvalue(), mime_type="image/png"
        )
        return await _process(googleApiKey, elevenlabsApiKey, original_image)
    except HTTPException:
        # Re-raise HTTP exceptions unchanged
        raise
    except Exception as e:
        # Catch any other errors
        raise HTTPException(status_code=500, detail=f"Error processing image: {str(e)}")
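

# Illustrative only: a sketch of calling the upload endpoint with multipart form
# data, assuming the /process-image/upload route above. The API keys are sent as
# query parameters because the endpoint declares them as plain (non-Form) fields.
#
#   import requests
#
#   with open("photo.png", "rb") as f:
#       resp = requests.post(
#           "http://localhost:8000/process-image/upload",
#           params={"googleApiKey": "YOUR_GOOGLE_API_KEY"},
#           files={"image": ("photo.png", f, "image/png")},
#       )
#   print(resp.status_code, resp.json().get("description"))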


@app.get("/health")  # route path assumed
async def health_check():
    """Health check endpoint"""
    return {"status": "healthy", "message": "Accessibility Service API is running"}