diff --git a/app.py b/app.py
index f29d605..a9fcb02 100644
--- a/app.py
+++ b/app.py
@@ -23,21 +23,31 @@ def index():
 @app.route('/about')
 def about():
     return render_template('about.html')
 
-
 @app.route('/proxy-chat', methods=['POST'])
 def proxy_chat():
     target_url = "http://192.168.0.37:5002/v1/chat/completions"
-    # Forward the request to your local LLM
-    # We use stream=True here to get the chunks from the backend
-    req = requests.post(target_url, json=request.json, stream=True)
-
-    def generate():
-        # Yield each chunk as it arrives from the LLM
-        for chunk in req.iter_content(chunk_size=1024):
-            yield chunk
-
-    return Response(stream_with_context(generate()), content_type=req.headers['content-type'])
+    try:
+        # 1. Forward the request without stream=True
+        #    llama.cpp will now send back one large JSON object
+        response = requests.post(
+            target_url,
+            json=request.json,
+            timeout=120  # Important: give the i5-8400 time to think
+        )
+
+        # 2. Return the full JSON response to the browser
+        return Response(
+            response.content,
+            status=response.status_code,
+            content_type=response.headers['content-type']
+        )
+
+    except requests.exceptions.Timeout:
+        return {"error": "The server took too long to answer. Try a different prompt."}, 504
+    except Exception as e:
+        return {"error": str(e)}, 500
+
 
 @app.route('/post/<path:path>')  # Adding /post/ prefix helps organize URLs
 def post(path):
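
For reference, a minimal client-side sketch of how the browser (or a quick test script) might call the new non-streaming /proxy-chat route. The Flask host and port (localhost:5000) and the payload fields are assumptions for illustration, not taken from this diff; the response shape follows the OpenAI-compatible JSON that llama.cpp's /v1/chat/completions endpoint returns.

import requests

# Hypothetical test client for the non-streaming proxy.
# localhost:5000 is an assumed Flask address, not part of this change.
payload = {
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": False,  # the proxy no longer streams, so ask for a single JSON body
}

# Client timeout slightly above the proxy's 120 s so the 504 path can fire first.
resp = requests.post("http://localhost:5000/proxy-chat", json=payload, timeout=130)
resp.raise_for_status()
data = resp.json()
print(data["choices"][0]["message"]["content"])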