From 497cc3846a4b33751bd0122d0c68e1e50b1aa53f Mon Sep 17 00:00:00 2001 From: adityajunwal Date: Wed, 31 Dec 2025 14:51:25 +0530 Subject: [PATCH] OpenAI Speech to Text --- src/routes/v2/modelRouter.py | 6 +- .../commonServices/baseService/baseService.py | 20 ++++- src/services/commonServices/common.py | 80 +++++++++++++++++++ .../commonServices/openAI/audio_model.py | 68 ++++++++++++++++ .../commonServices/openAI/openai_response.py | 15 ++++ src/services/utils/ai_middleware_format.py | 19 ++++- src/services/utils/common_utils.py | 4 +- 7 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 src/services/commonServices/openAI/audio_model.py diff --git a/src/routes/v2/modelRouter.py b/src/routes/v2/modelRouter.py index a329bb46..8023ea07 100644 --- a/src/routes/v2/modelRouter.py +++ b/src/routes/v2/modelRouter.py @@ -1,7 +1,7 @@ from fastapi import APIRouter, Depends, Request, HTTPException import asyncio -from src.services.commonServices.common import chat_multiple_agents, embedding, batch, image, orchestrator_chat +from src.services.commonServices.common import chat_multiple_agents, embedding, batch, image, orchestrator_chat, audio from src.services.commonServices.baseService.utils import make_request_data from ...middlewares.middleware import jwt_middleware from ...middlewares.getDataUsingBridgeId import add_configuration_data_to_body @@ -50,6 +50,10 @@ async def chat_completion(request: Request, db_config: dict = Depends(add_config loop = asyncio.get_event_loop() result = await image(data_to_send) return result + if type == 'audio': + loop = asyncio.get_event_loop() + result = await audio(data_to_send) + return result # Always use chat_multiple_agents - it handles both single and multiple agents loop = asyncio.get_event_loop() result = await chat_multiple_agents(data_to_send) diff --git a/src/services/commonServices/baseService/baseService.py b/src/services/commonServices/baseService/baseService.py index 9860b232..a3846350 100644 --- 
a/src/services/commonServices/baseService/baseService.py +++ b/src/services/commonServices/baseService/baseService.py @@ -16,6 +16,7 @@ from ..openRouter.openRouter_modelrun import openrouter_modelrun from ....configs.constant import service_name from ..openAI.image_model import OpenAIImageModel +from ..openAI.audio_model import OpenAIAudioModel from ..Google.gemini_image_model import gemini_image_model from ..Google.gemini_video_model import gemini_video_model from ..AiMl.ai_ml_model_run import ai_ml_model_run @@ -70,12 +71,14 @@ def __init__(self, params): self.function_time_logs = params.get('function_time_logs') self.files = params.get('files') or [] self.file_data = params.get('file_data') + self.audio_file = params.get('audio_file') self.youtube_url = params.get('youtube_url') self.web_search_filters = params.get('web_search_filters') self.folder_id = params.get('folder_id') self.bridge_configurations = params.get('bridge_configurations') + def aiconfig(self): return self.customConfig @@ -378,4 +381,19 @@ async def video(self, configuration, apikey, service): } except Exception as e: logger.error(f"chats error in video=>, {str(e)}, {traceback.format_exc()}") - raise ValueError(f"error occurs from {self.service} api {e.args[0]}") \ No newline at end of file + raise ValueError(f"error occurs from {self.service} api {e.args[0]}") + + async def audio(self, configuration, apikey, service, audio_file): + try: + response = {} + if service == service_name['openai']: + response = await OpenAIAudioModel(configuration, apikey, self.execution_time_logs, self.timer, audio_file) + if not response['success']: + raise ValueError(response['error']) + return { + 'success': True, + 'modelResponse': response['response'] + } + except Exception as e: + logger.error(f"chats error in audio=>, {str(e)}, {traceback.format_exc()}") + raise ValueError(f"error occurs from {self.service} api {e.args[0]}") diff --git a/src/services/commonServices/common.py 
b/src/services/commonServices/common.py index f6f03ba3..50ec7de8 100644 --- a/src/services/commonServices/common.py +++ b/src/services/commonServices/common.py @@ -630,3 +630,83 @@ async def image(request_body): # Process background tasks for error handling await process_background_tasks_for_error(parsed_data, error) raise ValueError(error) + +@handle_exceptions +async def audio(request_body): + result = {} + class_obj = {} + try: + # Store bridge_configurations for potential transfer logic + bridge_configurations = request_body.get('body', {}).get('bridge_configurations', {}) + + # Step 1: Parse and validate request body + parsed_data = parse_request_body(request_body) + + # Initialize or retrieve transfer_request_id for tracking transfers + transfer_request_id = parsed_data.get('transfer_request_id') or str(uuid.uuid1()) + parsed_data['transfer_request_id'] = transfer_request_id + + # Initialize transfer history for this request if not exists + if transfer_request_id not in TRANSFER_HISTORY: + TRANSFER_HISTORY[transfer_request_id] = [] + + # Step 2: Initialize Timer + timer = initialize_timer(parsed_data['state']) + + # Step 3: Load Model Configuration + model_config, custom_config, model_output_config = await load_model_configuration( + parsed_data['model'], parsed_data['configuration'], parsed_data['service'], + ) + + # Step 4: Configure Custom Settings + custom_config = await configure_custom_settings( + model_config['configuration'], custom_config, parsed_data['service'] + ) + # Step 5: Manage Threads + thread_info = await manage_threads(parsed_data) + + # Step 6: Execute Service Handler + params = build_service_params( + parsed_data, custom_config, model_output_config, thread_info, timer, None, send_error_to_webhook, bridge_configurations + ) + + class_obj = await Helper.create_service_handler(params, parsed_data['service']) + result = await class_obj.execute() + + if not result["success"]: + raise ValueError(result) + + # Create latency object using utility 
function + latency = create_latency_object(timer, params) + if not parsed_data['is_playground']: + if result.get('response') and result['response'].get('data'): + result['response']['data']['id'] = parsed_data['message_id'] + await sendResponse(parsed_data['response_format'], result["response"], success=True, variables=parsed_data.get('variables',{})) + # Update usage metrics for successful API calls + update_usage_metrics(parsed_data, params, latency, result=result, success=True) + # Process background tasks (handles both transfer and non-transfer cases) + await process_background_tasks(parsed_data, result, params, thread_info, transfer_request_id, bridge_configurations) + return JSONResponse(status_code=200, content={"success": True, "response": result["response"]}) + + except (Exception, ValueError, BadRequestException) as error: + if not isinstance(error, BadRequestException): + logger.error(f'Error in audio service: {str(error)}, {traceback.format_exc()}') + if not parsed_data['is_playground']: + # Update parsed_data with thread_info if available and thread_id/sub_thread_id are None + if 'thread_info' in locals() and thread_info: + if not parsed_data.get('thread_id') and thread_info.get('thread_id'): + parsed_data['thread_id'] = thread_info['thread_id'] + if not parsed_data.get('sub_thread_id') and thread_info.get('sub_thread_id'): + parsed_data['sub_thread_id'] = thread_info['sub_thread_id'] + + # Create latency object and update usage metrics + latency = create_latency_object(timer, params) if 'params' in locals() and params else None + if latency: + update_usage_metrics(parsed_data, params, latency, error=error, success=False) + + # Create history parameters + parsed_data['historyParams'] = create_history_params(parsed_data, error, class_obj, thread_info if 'thread_info' in locals() else None) + await sendResponse(parsed_data['response_format'], result.get("modelResponse", str(error)), variables=parsed_data['variables']) if 
parsed_data['response_format']['type'] != 'default' else None + # Process background tasks for error handling + await process_background_tasks_for_error(parsed_data, error) + raise ValueError(error) diff --git a/src/services/commonServices/openAI/audio_model.py b/src/services/commonServices/openAI/audio_model.py new file mode 100644 index 00000000..a3065fb1 --- /dev/null +++ b/src/services/commonServices/openAI/audio_model.py @@ -0,0 +1,68 @@ +import traceback +from openai import AsyncOpenAI +import base64 +import io + + +async def OpenAIAudioModel(configuration, apiKey, execution_time_logs, timer, audio_file): + try: + openai_config = AsyncOpenAI(api_key=apiKey) + timer.start() + + model = configuration.get('model') + + # Handle base64 encoded audio - strictly require data:audio format + if isinstance(audio_file, str): + if not audio_file.startswith('data:audio'): + raise ValueError("Audio file must be in 'data:audio' format (e.g., 'data:audio/mp3;base64,...')") + + # Extract base64 data after the comma + base64_data = audio_file.split(',', 1)[1] + audio_bytes = base64.b64decode(base64_data) + audio_file = io.BytesIO(audio_bytes) + audio_file.name = "audio.mp3" # OpenAI requires a filename + + # Process transcription + chat_completion = await openai_config.audio.transcriptions.create( + model=model, + file=audio_file, + ) + execution_time_logs.append({ + "step": "OpenAI audio transcription processing time", + "time_taken": timer.stop("OpenAI audio transcription processing time") + }) + + # Extract token-based usage from newer audio models + usage_data = {} + if hasattr(chat_completion, 'usage') and chat_completion.usage: + usage = chat_completion.usage + if hasattr(usage, 'input_tokens'): + usage_data = { + "input_tokens": usage.input_tokens, + "input_token_details": dict(usage.input_token_details) if hasattr(usage, 'input_token_details') else {}, + "output_tokens": usage.output_tokens, + "total_tokens": usage.total_tokens + } + + response = { + 'text': 
chat_completion.text, + 'operation': 'transcription', + 'model': model, + 'usage': usage_data + } + + return { + 'success': True, + 'response': response + } + except Exception as error: + execution_time_logs.append({ + "step": "OpenAI audio processing time", + "time_taken": timer.stop("OpenAI audio processing time") + }) + print("audio model error=>", error) + traceback.print_exc() + return { + 'success': False, + 'error': str(error) + } diff --git a/src/services/commonServices/openAI/openai_response.py b/src/services/commonServices/openAI/openai_response.py index a4f8b364..f5fc3af3 100644 --- a/src/services/commonServices/openAI/openai_response.py +++ b/src/services/commonServices/openAI/openai_response.py @@ -23,6 +23,21 @@ async def execute(self): historyParams = self.prepare_history_params(response, modelResponse, tools, None) historyParams['message'] = "image generated successfully" historyParams['type'] = 'assistant' + elif self.type == 'audio': + # Audio handling for transcription/translation + audio_file = self.audio_file # Use dedicated audio_file field + openAIResponse = await self.audio(self.customConfig, self.apikey, service_name['openai'], audio_file) + modelResponse = openAIResponse.get("modelResponse", {}) + if not openAIResponse.get('success'): + if not self.playground: + await self.handle_failure(openAIResponse) + raise ValueError(openAIResponse.get('error')) + response = await Response_formatter(modelResponse, service_name['openai'], tools, self.type, None) + if not self.playground: + historyParams = self.prepare_history_params(response, modelResponse, tools, None) + operation = modelResponse.get('operation', 'audio processing') + historyParams['message'] = f"audio {operation} completed successfully" + historyParams['type'] = 'assistant' else: conversation = ConversationService.createOpenAiConversation(self.configuration.get('conversation'), self.memory, self.files).get('messages', []) developer = [{"role": "developer", "content": 
self.configuration['prompt']}] if not self.reasoning_model else [] diff --git a/src/services/utils/ai_middleware_format.py b/src/services/utils/ai_middleware_format.py index 00799485..fd097bdb 100644 --- a/src/services/utils/ai_middleware_format.py +++ b/src/services/utils/ai_middleware_format.py @@ -34,7 +34,7 @@ async def Response_formatter(response = {}, service = None, tools={}, type='chat } } - elif service == service_name['openai'] and (type != 'image' and type != 'embedding'): + elif service == service_name['openai'] and (type != 'image' and type != 'embedding' and type != 'audio'): return { "data": { "id": response.get("id", None), @@ -128,6 +128,23 @@ async def Response_formatter(response = {}, service = None, tools={}, type='chat "file_data" : response.get('data')[0].get('file_reference') } } + elif service == service_name['openai'] and type == 'audio': + return { + "data": { + "id": None, + "content": response.get('text'), + "model": response.get('model'), + "role": "assistant", + "finish_reason": "completed", + "operation": response.get('operation') + }, + "usage": { + "input_tokens": response.get('usage', {}).get('input_tokens'), + "input_token_details": response.get('usage', {}).get('input_token_details'), + "output_tokens": response.get('usage', {}).get('output_tokens'), + "total_tokens": response.get('usage', {}).get('total_tokens'), + } + } elif service == service_name['openai']: image_urls = [] for image_data in response.get('data', []): diff --git a/src/services/utils/common_utils.py b/src/services/utils/common_utils.py index f7925d79..043e360a 100644 --- a/src/services/utils/common_utils.py +++ b/src/services/utils/common_utils.py @@ -184,7 +184,8 @@ def parse_request_body(request_body): "testcase_data" : body.get('testcase_data') or {}, "is_embed" : body.get('is_embed'), "user_id" : body.get('user_id'), - "file_data" : body.get('video_data') or {}, + "file_data" : body.get('file_data') or body.get('video_data') or {}, + "audio_file" : 
body.get('audio_file') or None, "youtube_url" : body.get('youtube_url') or None, "folder_id": body.get('folder_id'), "web_search_filters" : body.get('web_search_filters') or None, @@ -423,6 +424,7 @@ def build_service_params(parsed_data, custom_config, model_output_config, thread "built_in_tools" : parsed_data['built_in_tools'], "files" : parsed_data['files'], "file_data" : parsed_data['file_data'], + "audio_file" : parsed_data['audio_file'], "youtube_url" : parsed_data['youtube_url'], "web_search_filters" : parsed_data['web_search_filters'], "folder_id": parsed_data.get('folder_id'),