Files
chat-frontend/backend/app/api/chats.py

284 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from typing import List, Optional
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from sqlalchemy.orm import Session as DBSession
from sqlalchemy import select, desc
from app.db.session import get_db
from app.db.models import User, Chat, Message
from app.api.deps import get_current_user
from app.core.models_catalog import AVAILABLE_MODELS, ModelInfo
import logging
import sys
# Module logger: INFO-level, writing to stdout so container logs pick it up.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
if not logger.handlers:
    # Only attach a handler once; repeated imports/reloads must not duplicate output.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    )
    logger.addHandler(stream_handler)

# Router collecting all chat-related endpoints in this module.
router = APIRouter()
class ChatCreateRequest(BaseModel):
    """Request body for POST /chats."""
    # Human-readable chat title; callers may omit it.
    title: str = "New Chat"
    # Alias of the model to use; validated against AVAILABLE_MODELS in create_chat.
    model_alias: str
class ChatResponse(BaseModel):
    """Serialized chat returned by the /chats endpoints."""
    id: str
    title: str
    model_alias: str
    created_at: datetime
    updated_at: datetime
class MessageCreateRequest(BaseModel):
    """Request body for POST /chats/{chat_id}/messages."""
    content: str
    # Sender role; defaults to "user" (add_message stores it verbatim).
    role: str = "user"
class MessageResponse(BaseModel):
    """Serialized message returned by the message endpoints."""
    id: str
    role: str
    content: str
    created_at: datetime
@router.get("/models", response_model=List[ModelInfo])
def get_models(user: User = Depends(get_current_user)):
    """Expose the static catalog of selectable LLM models (auth required)."""
    return AVAILABLE_MODELS
@router.post("/chats", response_model=ChatResponse)
def create_chat(
    req: ChatCreateRequest,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user)
):
    """Create a chat owned by the current user.

    Rejects unknown model aliases with 400; otherwise persists the chat
    and returns the refreshed row.
    """
    known_aliases = {model.alias for model in AVAILABLE_MODELS}
    if req.model_alias not in known_aliases:
        raise HTTPException(status_code=400, detail="Invalid model alias")
    new_chat = Chat(user_id=user.id, title=req.title, model_alias=req.model_alias)
    db.add(new_chat)
    db.commit()
    # Refresh to populate DB-generated fields (id, timestamps) before returning.
    db.refresh(new_chat)
    return new_chat
@router.get("/chats", response_model=List[ChatResponse])
def list_chats(
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user)
):
    """Return the current user's chats, most recently updated first."""
    query = (
        select(Chat)
        .where(Chat.user_id == user.id)
        .order_by(desc(Chat.updated_at))
    )
    return db.scalars(query).all()
@router.get("/chats/{chat_id}", response_model=ChatResponse)
def get_chat(
    chat_id: str,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user)
):
    """Fetch one chat; 404 when it is missing or belongs to another user."""
    chat = db.get(Chat, chat_id)
    # A foreign chat is reported as 404 (not 403) so existence is not leaked.
    owned = chat is not None and chat.user_id == user.id
    if not owned:
        raise HTTPException(status_code=404, detail="Chat not found")
    return chat
@router.delete("/chats/{chat_id}")
def delete_chat(
    chat_id: str,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user)
):
    """Delete a chat owned by the current user; 404 when missing or foreign."""
    target = db.get(Chat, chat_id)
    if target is None or target.user_id != user.id:
        raise HTTPException(status_code=404, detail="Chat not found")
    db.delete(target)
    db.commit()
    return {"status": "ok"}
@router.get("/chats/{chat_id}/messages", response_model=List[MessageResponse])
def list_messages(
    chat_id: str,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user)
):
    """List a chat's messages in chronological order; 404 for a missing/foreign chat."""
    chat = db.get(Chat, chat_id)
    if chat is None or chat.user_id != user.id:
        raise HTTPException(status_code=404, detail="Chat not found")
    query = (
        select(Message)
        .where(Message.chat_id == chat_id)
        .order_by(Message.created_at)
    )
    return db.scalars(query).all()
from app.core.llm_client import llm_client, inference_lock
def sanitize_llm_text(raw_text: Optional[str]) -> Optional[str]:
    """Strip whitespace and literal <reasoning> tags from LLM output.

    Returns the cleaned text, or None when the input is None/empty or
    contains nothing but whitespace and tags.
    """
    if not raw_text:
        return None
    stripped = raw_text.strip()
    if not stripped:
        return None
    without_tags = stripped.replace("<reasoning>", "").replace("</reasoning>", "").strip()
    # An all-tag/whitespace payload collapses to "" — report it as unusable.
    return without_tags or None
def normalize_llm_response(content: str, reasoning: str) -> Optional[str]:
    """Choose the usable reply text from an LLM response.

    Prefers sanitized `content`; falls back to sanitized `reasoning`
    (some models put the answer there); None when both are empty.
    sanitize_llm_text never returns "", so the or-chain is safe.
    """
    return sanitize_llm_text(content) or sanitize_llm_text(reasoning)
@router.post("/chats/{chat_id}/messages", response_model=List[MessageResponse])
async def add_message(
    chat_id: str,
    req: MessageCreateRequest,
    db: DBSession = Depends(get_db),
    user: User = Depends(get_current_user)
):
    """Append a user message and synchronously generate the assistant reply.

    Flow: verify chat ownership, persist the user message, build the last-20
    messages as LLM history, then — under the global inference lock — switch
    the backing model if needed, wait for readiness, run inference with one
    controlled retry on empty output, and persist the assistant message.

    Returns:
        Both newly created messages: [user_msg, assistant_msg].

    Raises:
        HTTPException: 404 missing/foreign chat, 504 readiness timeout,
        500 inference failure or no valid output after retry.
    """
    chat = db.get(Chat, chat_id)
    if not chat or chat.user_id != user.id:
        # 404 (not 403) so chat existence is not leaked to other users.
        raise HTTPException(status_code=404, detail="Chat not found")
    # 1. Save user message
    user_msg = Message(chat_id=chat.id, role=req.role, content=req.content)
    db.add(user_msg)
    # Local import brings `timezone` into scope (module top imports only datetime).
    from datetime import datetime, timezone
    # Timestamp stored naive but UTC-derived.
    chat.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
    db.add(chat)
    # Commit before inference so the user message survives generation failures.
    db.commit()
    db.refresh(user_msg)
    logger.info(f"User message saved for chat {chat.id}. Selected model: {chat.model_alias}")
    # 2. Fetch recent chat history to assemble prompt
    # Get last 20 messages
    stmt = select(Message).where(Message.chat_id == chat_id).order_by(desc(Message.created_at)).limit(20)
    recent_msgs = db.scalars(stmt).all()
    # Query returned newest-first; restore chronological order for the prompt.
    recent_msgs.reverse()
    llm_history = []
    for m in recent_msgs:
        llm_history.append({"role": m.role, "content": m.content})
    # 3. Enter Critical Section for LLM Switch and Inference
    ai_response = None
    final_content = None
    async with inference_lock:
        try:
            status_data = await llm_client.get_status()
            current_model = status_data.get("active_model")
            logger.info(f"Current active llm-manager model: {current_model}")
            # Switch if needed
            switched = (current_model != chat.model_alias)
            if switched:
                logger.info(f"Switching model to {chat.model_alias}... (switch requested)")
                await llm_client.switch_model(chat.model_alias)
                logger.info(f"Successfully requested switch to {chat.model_alias}. Waiting for readiness...")
            # Wait for readiness (polled even when no switch occurred, so a
            # still-loading model is caught before inference).
            is_ready, iterations, final_status = await llm_client.wait_for_model_ready(
                model_name=chat.model_alias,
                timeout=60.0,
                poll_interval=2.0
            )
            if not is_ready:
                logger.error(f"Readiness timeout for {chat.model_alias} after {iterations} iterations. Final status: {final_status}")
                raise HTTPException(status_code=504, detail=f"LLM Manager readiness timeout for {chat.model_alias}")
            logger.info(f"Model {chat.model_alias} is ready after {iterations} iterations. Final status before completion: {final_status}")

            async def do_completion(msgs, max_tok=None, temp=None):
                # Single completion attempt; on a 502 or 503-in-detail failure
                # the model may have been unloaded, so re-request the switch
                # and retry the completion once.
                try:
                    return await llm_client.chat_completion(messages=msgs, max_tokens=max_tok, temperature=temp)
                except HTTPException as e:
                    if e.status_code == 502 or "503" in str(e.detail):
                        logger.warning("Generation failed (possibly 503 unloading). Retrying switch and completion...")
                        await llm_client.switch_model(chat.model_alias)
                        return await llm_client.chat_completion(messages=msgs, max_tokens=max_tok, temperature=temp)
                    raise e
            # Call inference (Attempt 1)
            logger.info("Starting chat completion (Attempt 1)...")
            ai_response = await do_completion(llm_history)
            # Parse Attempt 1
            ai_choice = ai_response.get("choices", [{}])[0].get("message", {})
            ai_content_raw = ai_choice.get("content", "") or ""
            ai_reasoning_raw = ai_choice.get("reasoning_content", "") or ""
            c_san = sanitize_llm_text(ai_content_raw)
            r_san = sanitize_llm_text(ai_reasoning_raw)
            final_content = normalize_llm_response(ai_content_raw, ai_reasoning_raw)
            logger.info(
                f"LLM Stats (Attempt 1) | model: {chat.model_alias} | "
                f"switched: {switched} | "
                f"content_raw_len: {len(ai_content_raw)} | reasoning_raw_len: {len(ai_reasoning_raw)} | "
                f"content_san_len: {len(c_san) if c_san else 0} | reasoning_san_len: {len(r_san) if r_san else 0}"
            )
            if not final_content:
                # Attempt 1 produced no usable text in either field: retry once
                # with an explicit instruction appended and conservative sampling.
                logger.warning("Attempt 1 rejected: invalid response (both sanitized texts are empty). Triggering controlled retry.")
                retry_history = list(llm_history)
                retry_history.append({
                    "role": "user",
                    "content": "Ответь сразу финальным текстом. Не выводи reasoning, chain-of-thought, XML-теги или служебную разметку."
                })
                logger.info("Starting chat completion (Attempt 2 - Retry) with max_tokens=2048 and temperature=0.1...")
                ai_response_retry = await do_completion(retry_history, max_tok=2048, temp=0.1)
                ai_choice_r = ai_response_retry.get("choices", [{}])[0].get("message", {})
                ai_content_r_raw = ai_choice_r.get("content", "") or ""
                ai_reasoning_r_raw = ai_choice_r.get("reasoning_content", "") or ""
                c_san_r = sanitize_llm_text(ai_content_r_raw)
                r_san_r = sanitize_llm_text(ai_reasoning_r_raw)
                final_content = normalize_llm_response(ai_content_r_raw, ai_reasoning_r_raw)
                logger.info(
                    f"LLM Stats (Attempt 2 - Retry) | model: {chat.model_alias} | "
                    f"content_raw_len: {len(ai_content_r_raw)} | reasoning_raw_len: {len(ai_reasoning_r_raw)} | "
                    f"content_san_len: {len(c_san_r) if c_san_r else 0} | reasoning_san_len: {len(r_san_r) if r_san_r else 0}"
                )
                if not final_content:
                    logger.error("Attempt 2 also failed to produce valid output. Aborting.")
                    raise HTTPException(status_code=500, detail="LLM failed to produce valid output after retry.")
                else:
                    logger.info("Attempt 2 succeeded in producing valid output.")
            else:
                # Attempt 1 succeeded; note when the answer came from the
                # reasoning field rather than the content field.
                if not ai_content_raw.strip() and final_content:
                    logger.info("Fallback to reasoning_content was chosen because 'content' was empty (Attempt 1).")
        except HTTPException:
            # Pass through the 404/504/500 raised above unchanged.
            raise
        except Exception as e:
            logger.error(f"Inference pipeline failed: {e}")
            raise HTTPException(status_code=500, detail=str(e))
    # 5. Save AI message
    assistant_msg = Message(chat_id=chat.id, role="assistant", content=final_content)
    db.add(assistant_msg)
    chat.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
    db.add(chat)
    db.commit()
    db.refresh(assistant_msg)
    logger.info("Assistant message saved successfully.")
    return [user_msg, assistant_msg]