Converted LLM to non-streaming
All checks were successful
Redeploy landing on Push / Explore-Gitea-Actions (push) Successful in 7s
app.py (32 changed lines)
@@ -23,21 +23,31 @@ def index():
 @app.route('/about')
 def about():
     return render_template('about.html')
 
 @app.route('/proxy-chat', methods=['POST'])
 def proxy_chat():
     target_url = "http://192.168.0.37:5002/v1/chat/completions"
 
-    # Forward the request to your local LLM
-    # We use stream=True here to get the chunks from the backend
-    req = requests.post(target_url, json=request.json, stream=True)
-
-    def generate():
-        # Yield each chunk as it arrives from the LLM
-        for chunk in req.iter_content(chunk_size=1024):
-            yield chunk
-
-    return Response(stream_with_context(generate()), content_type=req.headers['content-type'])
+    try:
+        # 1. Forward the request without stream=True
+        # llama.cpp will now send back one large JSON object
+        response = requests.post(
+            target_url,
+            json=request.json,
+            timeout=120 # Important: Give the i5-8400 time to think
+        )
+
+        # 2. Return the full JSON response to the browser
+        return Response(
+            response.content,
+            status=response.status_code,
+            content_type=response.headers['content-type']
+        )
+
+    except requests.exceptions.Timeout:
+        return {"error": "The server took too long to answer. Try a different prompt."}, 504
+    except Exception as e:
+        return {"error": str(e)}, 500
 
 @app.route('/post/<path:path>/') # Adding /post/ prefix helps organize URLs
 def post(path):
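
With the streaming path removed, a caller now receives the model's answer as one JSON document instead of a sequence of chunks. As a rough illustration (not part of this commit), this is roughly how the endpoint could be exercised; the localhost:5000 dev-server address and the message payload are assumptions, and the response shape follows the OpenAI-compatible schema that llama.cpp's /v1/chat/completions endpoint returns:

# Hypothetical client call to the non-streaming /proxy-chat endpoint.
# Assumes the Flask app runs on the dev-server default port 5000 and
# that the backend answers in the OpenAI-style chat completions format.
import requests

payload = {
    "messages": [
        {"role": "user", "content": "Hello!"}
    ]
}

resp = requests.post(
    "http://localhost:5000/proxy-chat",
    json=payload,
    timeout=130,  # a little longer than the proxy's own 120 s timeout
)
resp.raise_for_status()
data = resp.json()

# In the OpenAI-compatible format the reply text sits at
# choices[0].message.content.
print(data["choices"][0]["message"]["content"])

Because the proxy forwards the backend's status code, the 504 returned by the timeout handler surfaces here as an HTTPError from raise_for_status(), which a frontend would translate into the "try a different prompt" message.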