"""Chat/message API router: model catalog, chat CRUD, and LLM-backed messaging.

All endpoints require an authenticated user; chats are scoped to their owner.
"""

from typing import List, Optional
from datetime import datetime, timezone

from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy.orm import Session as DBSession
from sqlalchemy import select, desc

from app.db.session import get_db
from app.db.models import User, Chat, Message
from app.api.deps import get_current_user
from app.core.models_catalog import AVAILABLE_MODELS, ModelInfo

import logging
import sys

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Attach a stdout handler only once (module may be re-imported by the reloader).
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(handler)

router = APIRouter()


# --------------------------------------------------------------------------- #
# Request / response schemas
# --------------------------------------------------------------------------- #

class ChatCreateRequest(BaseModel):
    # title defaults so the client may create a chat with only a model alias
    title: str = "New Chat"
    model_alias: str


class ChatResponse(BaseModel):
    id: str
    title: str
    model_alias: str
    created_at: datetime
    updated_at: datetime


class MessageCreateRequest(BaseModel):
    content: str
    role: str = "user"


class MessageResponse(BaseModel):
    id: str
    role: str
    content: str
    created_at: datetime


# --------------------------------------------------------------------------- #
# Internal helpers
# --------------------------------------------------------------------------- #

def _utcnow_naive() -> datetime:
    """Current UTC time as a naive datetime (DB columns store naive UTC)."""
    return datetime.now(timezone.utc).replace(tzinfo=None)


def _get_owned_chat(db: DBSession, chat_id: str, user: User) -> Chat:
    """Load a chat and verify ownership; raise 404 if missing or not the user's.

    A foreign user's chat is reported as 404 (not 403) so chat ids are not
    enumerable.
    """
    chat = db.get(Chat, chat_id)
    if not chat or chat.user_id != user.id:
        raise HTTPException(status_code=404, detail="Chat not found")
    return chat


# --------------------------------------------------------------------------- #
# Endpoints
# --------------------------------------------------------------------------- #

@router.get("/models", response_model=List[ModelInfo])
def get_models(user: User = Depends(get_current_user)):
    """Return the static catalog of selectable LLM models."""
    return AVAILABLE_MODELS


@router.post("/chats", response_model=ChatResponse)
def create_chat(
    req: ChatCreateRequest,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Create a chat owned by the current user, validating the model alias."""
    valid_aliases = {m.alias for m in AVAILABLE_MODELS}
    if req.model_alias not in valid_aliases:
        raise HTTPException(status_code=400, detail="Invalid model alias")
    chat = Chat(user_id=user.id, title=req.title, model_alias=req.model_alias)
    db.add(chat)
    db.commit()
    db.refresh(chat)
    return chat


@router.get("/chats", response_model=List[ChatResponse])
def list_chats(
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """List the current user's chats, most recently updated first."""
    stmt = (
        select(Chat)
        .where(Chat.user_id == user.id)
        .order_by(desc(Chat.updated_at))
    )
    return db.scalars(stmt).all()


@router.get("/chats/{chat_id}", response_model=ChatResponse)
def get_chat(
    chat_id: str,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Fetch a single chat owned by the current user (404 otherwise)."""
    return _get_owned_chat(db, chat_id, user)


@router.delete("/chats/{chat_id}")
def delete_chat(
    chat_id: str,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Delete an owned chat.

    NOTE(review): messages are assumed to be removed via an ORM/DB cascade on
    Chat -> Message — confirm the relationship config, otherwise rows leak.
    """
    chat = _get_owned_chat(db, chat_id, user)
    db.delete(chat)
    db.commit()
    return {"status": "ok"}


@router.get("/chats/{chat_id}/messages", response_model=List[MessageResponse])
def list_messages(
    chat_id: str,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """List all messages of an owned chat in chronological order."""
    _get_owned_chat(db, chat_id, user)
    stmt = (
        select(Message)
        .where(Message.chat_id == chat_id)
        .order_by(Message.created_at)
    )
    return db.scalars(stmt).all()


# Imported mid-file in the original — kept here, presumably to avoid a
# circular import with app.core.llm_client; verify before moving to the top.
from app.core.llm_client import llm_client, inference_lock


def sanitize_llm_text(raw_text: Optional[str]) -> Optional[str]:
    """Strip whitespace and reasoning-tag markup; return None if nothing remains."""
    if not raw_text:
        return None
    text = raw_text.strip()
    if not text:
        return None
    # NOTE(review): the original code read `text.replace("", "").replace("", "")`
    # — a literal no-op, almost certainly tag literals lost in transit.
    # Reconstructed as stripping <think>/</think> reasoning markers (consistent
    # with the reasoning_content handling below) — confirm against the actual
    # llm-manager output format.
    cleaned = text.replace("<think>", "").replace("</think>", "").strip()
    if not cleaned:
        return None
    return cleaned


def normalize_llm_response(content: str, reasoning: str) -> Optional[str]:
    """Pick the best usable text: sanitized content, else sanitized reasoning, else None."""
    c_sanitized = sanitize_llm_text(content)
    if c_sanitized:
        return c_sanitized
    r_sanitized = sanitize_llm_text(reasoning)
    if r_sanitized:
        return r_sanitized
    return None


@router.post("/chats/{chat_id}/messages", response_model=List[MessageResponse])
async def add_message(
    chat_id: str,
    req: MessageCreateRequest,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user),
):
    """Append a user message, run LLM inference, and persist the assistant reply.

    Returns the [user_message, assistant_message] pair. Raises 404 for a
    missing/foreign chat, 504 on model readiness timeout, 500 when the LLM
    fails to produce usable output even after one controlled retry.
    """
    chat = _get_owned_chat(db, chat_id, user)

    # 1. Save the user message and bump the chat's updated_at.
    user_msg = Message(chat_id=chat.id, role=req.role, content=req.content)
    db.add(user_msg)
    chat.updated_at = _utcnow_naive()
    db.add(chat)
    db.commit()
    db.refresh(user_msg)
    logger.info(f"User message saved for chat {chat.id}. Selected model: {chat.model_alias}")

    # 2. Assemble the prompt from the 20 most recent messages, oldest first.
    stmt = (
        select(Message)
        .where(Message.chat_id == chat_id)
        .order_by(desc(Message.created_at))
        .limit(20)
    )
    recent_msgs = list(db.scalars(stmt).all())
    recent_msgs.reverse()
    llm_history = [{"role": m.role, "content": m.content} for m in recent_msgs]

    # 3. Critical section: model switching and inference are serialized so a
    #    concurrent request cannot unload the model mid-generation.
    final_content = None
    async with inference_lock:
        try:
            status_data = await llm_client.get_status()
            current_model = status_data.get("active_model")
            logger.info(f"Current active llm-manager model: {current_model}")

            # Request a switch only when the chat's model is not already active.
            switched = (current_model != chat.model_alias)
            if switched:
                logger.info(f"Switching model to {chat.model_alias}... (switch requested)")
                await llm_client.switch_model(chat.model_alias)
                logger.info(f"Successfully requested switch to {chat.model_alias}. Waiting for readiness...")

            # Always wait for readiness — even without a switch the backend
            # may still be loading the model.
            is_ready, iterations, final_status = await llm_client.wait_for_model_ready(
                model_name=chat.model_alias, timeout=60.0, poll_interval=2.0
            )
            if not is_ready:
                logger.error(
                    f"Readiness timeout for {chat.model_alias} after {iterations} iterations. Final status: {final_status}"
                )
                raise HTTPException(
                    status_code=504,
                    detail=f"LLM Manager readiness timeout for {chat.model_alias}",
                )
            logger.info(
                f"Model {chat.model_alias} is ready after {iterations} iterations. Final status before completion: {final_status}"
            )

            async def do_completion(msgs, max_tok=None, temp=None):
                """One completion call; on a 502/503-style failure, re-request the switch and retry once."""
                try:
                    return await llm_client.chat_completion(
                        messages=msgs, max_tokens=max_tok, temperature=temp
                    )
                except HTTPException as e:
                    if e.status_code == 502 or "503" in str(e.detail):
                        logger.warning("Generation failed (possibly 503 unloading). Retrying switch and completion...")
                        await llm_client.switch_model(chat.model_alias)
                        return await llm_client.chat_completion(
                            messages=msgs, max_tokens=max_tok, temperature=temp
                        )
                    raise e

            # Attempt 1.
            logger.info("Starting chat completion (Attempt 1)...")
            ai_response = await do_completion(llm_history)

            ai_choice = ai_response.get("choices", [{}])[0].get("message", {})
            ai_content_raw = ai_choice.get("content", "") or ""
            ai_reasoning_raw = ai_choice.get("reasoning_content", "") or ""
            c_san = sanitize_llm_text(ai_content_raw)
            r_san = sanitize_llm_text(ai_reasoning_raw)
            final_content = normalize_llm_response(ai_content_raw, ai_reasoning_raw)
            logger.info(
                f"LLM Stats (Attempt 1) | model: {chat.model_alias} | "
                f"switched: {switched} | "
                f"content_raw_len: {len(ai_content_raw)} | reasoning_raw_len: {len(ai_reasoning_raw)} | "
                f"content_san_len: {len(c_san) if c_san else 0} | reasoning_san_len: {len(r_san) if r_san else 0}"
            )

            if not final_content:
                # Attempt 2: re-ask with an explicit "final text only"
                # instruction and conservative sampling.
                logger.warning("Attempt 1 rejected: invalid response (both sanitized texts are empty). Triggering controlled retry.")
                retry_history = list(llm_history)
                retry_history.append({
                    "role": "user",
                    "content": "Ответь сразу финальным текстом. Не выводи reasoning, chain-of-thought, XML-теги или служебную разметку.",
                })
                logger.info("Starting chat completion (Attempt 2 - Retry) with max_tokens=2048 and temperature=0.1...")
                ai_response_retry = await do_completion(retry_history, max_tok=2048, temp=0.1)

                ai_choice_r = ai_response_retry.get("choices", [{}])[0].get("message", {})
                ai_content_r_raw = ai_choice_r.get("content", "") or ""
                ai_reasoning_r_raw = ai_choice_r.get("reasoning_content", "") or ""
                c_san_r = sanitize_llm_text(ai_content_r_raw)
                r_san_r = sanitize_llm_text(ai_reasoning_r_raw)
                final_content = normalize_llm_response(ai_content_r_raw, ai_reasoning_r_raw)
                logger.info(
                    f"LLM Stats (Attempt 2 - Retry) | model: {chat.model_alias} | "
                    f"content_raw_len: {len(ai_content_r_raw)} | reasoning_raw_len: {len(ai_reasoning_r_raw)} | "
                    f"content_san_len: {len(c_san_r) if c_san_r else 0} | reasoning_san_len: {len(r_san_r) if r_san_r else 0}"
                )

                if not final_content:
                    logger.error("Attempt 2 also failed to produce valid output. Aborting.")
                    raise HTTPException(status_code=500, detail="LLM failed to produce valid output after retry.")
                else:
                    logger.info("Attempt 2 succeeded in producing valid output.")
            else:
                if not ai_content_raw.strip() and final_content:
                    logger.info("Fallback to reasoning_content was chosen because 'content' was empty (Attempt 1).")
        except HTTPException:
            raise
        except Exception as e:
            logger.error(f"Inference pipeline failed: {e}")
            raise HTTPException(status_code=500, detail=str(e))

    # 4. Save the assistant message and return the new pair.
    assistant_msg = Message(chat_id=chat.id, role="assistant", content=final_content)
    db.add(assistant_msg)
    chat.updated_at = _utcnow_naive()
    db.add(chat)
    db.commit()
    db.refresh(assistant_msg)
    logger.info("Assistant message saved successfully.")
    return [user_msg, assistant_msg]