diff --git a/Dockerfile b/Dockerfile
index 6f6f3bc..4d9c7e9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -5,18 +5,14 @@ FROM python:3.11-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 
-# Set work directory
 WORKDIR /app
 
-# Install dependencies
-# We do this before copying the whole app to leverage Docker cache
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 
-# Copy the rest of the application code
 COPY . .
 
-# Expose the port your app runs on
 EXPOSE 5001
 
-CMD ["gunicorn", "--bind", "0.0.0.0:5001", "app:app"]
\ No newline at end of file
+# Increase timeout to 5 minutes (300s) and use threads to handle long waits
+CMD ["gunicorn", "--bind", "0.0.0.0:5001", "--timeout", "300", "--worker-class", "gthread", "--threads", "4", "app:app"]
\ No newline at end of file
diff --git a/app.py b/app.py
index 2ef09cb..20e54f4 100644
--- a/app.py
+++ b/app.py
@@ -27,32 +27,51 @@ def about():
 def proxy_chat():
+    """Forward the client's JSON body to the backend and stream the reply.
+
+    Returns 400 for a non-object JSON body, 504 when the backend times out,
+    and 500 for any other proxy failure; otherwise the backend's status code
+    and content type are passed through.
+    """
     target_url = "http://192.168.0.37:5002/v1/chat/completions"
 
+    # Reject missing or non-object bodies up front, and build a copy so the
+    # parsed request is not mutated when 'stream' is forced on for the backend.
+    body = request.get_json(silent=True)
+    if not isinstance(body, dict):
+        return {"error": "Request body must be a JSON object"}, 400
+    payload = {**body, "stream": True}
+
     try:
-        # We use stream=True so we don't load the whole response into RAM at once
+        # We use stream=True so requests doesn't buffer the whole response
         response = requests.post(
             target_url,
-            json=request.json,
+            json=payload,
             timeout=300,
             stream=True
         )
 
-        # Generator to yield chunks of data as they arrive
         def generate():
-            for chunk in response.iter_content(chunk_size=1024):
-                yield chunk
+            # Yield chunks to the browser as they arrive; always release the
+            # upstream connection, even if the client disconnects mid-stream.
+            try:
+                for chunk in response.iter_content(chunk_size=None):
+                    if chunk:
+                        yield chunk
+            finally:
+                response.close()
 
         return Response(
             generate(),
             status=response.status_code,
-            content_type=response.headers.get('content-type', 'application/json')
+            # Pass the backend's type through; default to SSE for streamed replies
+            content_type=response.headers.get('content-type', 'text/event-stream')
         )
 
     except requests.exceptions.Timeout:
-        return {"error": "The backend LLM timed out."}, 504
+        return {"error": "Backend timed out"}, 504
     except Exception as e:
         app.logger.error(f"Proxy error: {str(e)}")
         return {"error": "Internal server error"}, 500
 
 @app.route('/post//')
 def post(path):
     page = pages.get_or_404(path)