gaurv007 committed on
Commit
333f825
·
verified ·
1 Parent(s): f782685

fix: upload actual analyze/route.ts content — XSS fix, scan count fix, input validation

Browse files
Files changed (1) hide show
  1. web/app/api/analyze/route.ts +233 -1
web/app/api/analyze/route.ts CHANGED
@@ -1 +1,233 @@
1
- file:/app/web_api_analyze_route.ts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { NextRequest, NextResponse } from "next/server";
import { createClient } from "@/lib/supabase/server";

// Base URL of the Gradio Space that performs the actual contract analysis.
// Overridable via env var for local/staging deployments.
const GRADIO_URL = process.env.CLAUSEGUARD_GRADIO_URL || "https://gaurv007-clauseguard.hf.space";

// FIX v4.1: Max text size validation (prevent oversized payloads)
const MAX_TEXT_LENGTH = 200_000; // 200KB
8
+
9
+ export async function POST(req: NextRequest) {
10
+ try {
11
+ const supabase = await createClient();
12
+ const { data: { user } } = await supabase.auth.getUser();
13
+
14
+ if (!user) {
15
+ return NextResponse.json({ error: "Unauthorized. Please log in to analyze texts." }, { status: 401 });
16
+ }
17
+
18
+ const body = await req.json();
19
+ let { text } = body;
20
+
21
+ if (!text || typeof text !== "string" || text.trim().length < 50) {
22
+ return NextResponse.json(
23
+ { error: "Please provide at least 50 characters of text to analyze." },
24
+ { status: 400 }
25
+ );
26
+ }
27
+
28
+ // FIX v4.1: Input size validation
29
+ if (text.length > MAX_TEXT_LENGTH) {
30
+ return NextResponse.json(
31
+ { error: `Text too long (${(text.length / 1000).toFixed(0)}KB). Maximum is ${MAX_TEXT_LENGTH / 1000}KB.` },
32
+ { status: 400 }
33
+ );
34
+ }
35
+
36
+ // FIX v4.1: REMOVED the XSS sanitization that corrupted contract text.
37
+ // The old code did: text = text.replace(/</g, "&lt;").replace(/>/g, "&gt;");
38
+ // This PERMANENTLY MUTATED the text before analysis, corrupting contracts
39
+ // that contain < or > characters (e.g., "shall not exceed >$10,000").
40
+ // Sanitization should happen at RENDER TIME in the frontend, not at analysis time.
41
+ // The frontend already uses React which auto-escapes HTML in JSX.
42
+
43
+ // Check scan limits — FIX v4.1: query the CORRECT table name
44
+ const { data: profile } = await supabase
45
+ .from("profiles")
46
+ .select("plan, role, analyses_this_month")
47
+ .eq("id", user.id)
48
+ .single();
49
+
50
+ const isAdmin = profile?.role === "admin";
51
+ const plan = profile?.plan || "free";
52
+
53
+ // FIX v4.1: Use analyses_this_month from profiles (already tracked), not a separate count query
54
+ const scanCount = profile?.analyses_this_month ?? 0;
55
+ const limit = isAdmin ? 999999 : plan === "free" ? 10 : 999999;
56
+ if (scanCount >= limit) {
57
+ return NextResponse.json({ error: "Monthly scan limit reached. Please upgrade to Pro." }, { status: 403 });
58
+ }
59
+
60
+ // Step 1: Submit to Gradio Space
61
+ const submitRes = await fetch(`${GRADIO_URL}/gradio_api/call/_analysis_and_index`, {
62
+ method: "POST",
63
+ headers: { "Content-Type": "application/json" },
64
+ body: JSON.stringify({ data: [text] }),
65
+ });
66
+
67
+ if (!submitRes.ok) {
68
+ throw new Error(`Gradio submit failed: ${submitRes.status}`);
69
+ }
70
+
71
+ const { event_id } = await submitRes.json();
72
+ if (!event_id) throw new Error("No event_id from Gradio");
73
+
74
+ // FIX v4.1: Improved SSE polling with proper streaming support
75
+ // Uses exponential backoff instead of fixed 1s intervals
76
+ let resultText = "";
77
+ let attempts = 0;
78
+ const maxAttempts = 90; // 90 seconds max (increased from 60)
79
+ let delay = 500; // Start at 500ms, increase
80
+
81
+ while (attempts < maxAttempts) {
82
+ const resultRes = await fetch(
83
+ `${GRADIO_URL}/gradio_api/call/_analysis_and_index/${event_id}`,
84
+ { headers: { Accept: "text/event-stream" } }
85
+ );
86
+
87
+ resultText = await resultRes.text();
88
+
89
+ if (resultText.includes("event: complete")) break;
90
+ if (resultText.includes("event: error")) {
91
+ const errMatch = resultText.match(/data:\s*(.+)/);
92
+ throw new Error(errMatch ? errMatch[1] : "Analysis failed in backend");
93
+ }
94
+
95
+ await new Promise(r => setTimeout(r, delay));
96
+ delay = Math.min(delay * 1.2, 2000); // Cap at 2s
97
+ attempts++;
98
+ }
99
+
100
+ if (!resultText.includes("event: complete")) {
101
+ throw new Error("Analysis timed out. The backend may be loading models. Please try again in 30 seconds.");
102
+ }
103
+
104
+ // Step 3: Parse the SSE data
105
+ const completeIdx = resultText.indexOf("event: complete");
106
+ const dataIdx = resultText.indexOf("data: ", completeIdx);
107
+ if (dataIdx === -1) throw new Error("No data in response");
108
+
109
+ const dataStr = resultText.substring(dataIdx + 6).trim();
110
+
111
+ let gradioData: any[];
112
+ try {
113
+ gradioData = JSON.parse(dataStr);
114
+ } catch {
115
+ const cleaned = dataStr.replace(/[\x00-\x1f]/g, (ch: string) => {
116
+ if (ch === "\n") return "\\n";
117
+ if (ch === "\r") return "\\r";
118
+ if (ch === "\t") return "\\t";
119
+ return "";
120
+ });
121
+ gradioData = JSON.parse(cleaned);
122
+ }
123
+
124
+ // Step 4: Download the JSON report file (structured data)
125
+ const jsonFileObj = gradioData[8];
126
+ if (!jsonFileObj?.url) {
127
+ throw new Error("No JSON report generated");
128
+ }
129
+
130
+ const jsonRes = await fetch(jsonFileObj.url);
131
+ if (!jsonRes.ok) throw new Error("Failed to download analysis JSON");
132
+ const analysisData = await jsonRes.json();
133
+
134
+ // Step 5: Transform to frontend format
135
+ const riskScore = analysisData.risk?.score ?? 0;
136
+ const grade = analysisData.risk?.grade ?? "A";
137
+ const totalClauses = analysisData.metadata?.total_clauses ?? 0;
138
+ const flaggedCount = analysisData.metadata?.flagged_clauses ?? 0;
139
+
140
+ // Group clauses by text (multiple labels per clause)
141
+ const clauseMap = new Map<string, any>();
142
+ for (const cr of (analysisData.clauses || [])) {
143
+ if (!clauseMap.has(cr.text)) {
144
+ clauseMap.set(cr.text, { text: cr.text, categories: [] });
145
+ }
146
+ clauseMap.get(cr.text)!.categories.push({
147
+ name: cr.label,
148
+ severity: cr.risk,
149
+ confidence: cr.confidence,
150
+ description: cr.description,
151
+ });
152
+ }
153
+ const results = Array.from(clauseMap.values());
154
+
155
+ // FIX v4.1: Parse redlines from structured JSON data instead of fragile HTML regex
156
+ const redlines: any[] = [];
157
+
158
+ // Try to extract redlines from the analysis JSON first (if available)
159
+ if (analysisData.redlines && Array.isArray(analysisData.redlines)) {
160
+ for (const rl of analysisData.redlines) {
161
+ redlines.push({
162
+ clause_label: rl.clause_label || "",
163
+ risk_level: rl.risk_level || "MEDIUM",
164
+ original_text: rl.original_text || "",
165
+ safe_alternative: rl.safe_alternative || "",
166
+ template_alternative: rl.template_alternative || "",
167
+ legal_basis: rl.legal_basis || "",
168
+ consumer_standard: rl.consumer_standard || "",
169
+ tier: rl.tier || "template",
170
+ });
171
+ }
172
+ }
173
+
174
+ // Fallback: try parsing from HTML only if no structured data
175
+ if (redlines.length === 0) {
176
+ const redlineHtml = typeof gradioData[7] === "string" ? gradioData[7] : "";
177
+ if (redlineHtml.includes("Clause Redlining")) {
178
+ const blocks = redlineHtml.split(/border-left:4px solid #/);
179
+ for (let i = 1; i < blocks.length; i++) {
180
+ const block = blocks[i];
181
+ const labelMatch = block.match(/font-weight:600[^>]*>([^<]+)<\/span>\s*<span[^>]*font-weight:600[^>]*>([^<]+)/);
182
+ const origMatch = block.match(/<del>([^<]*)<\/del>/);
183
+ const safeBlock = block.match(/Suggested Alternative[\s\S]*?<div[^>]*color:#166534[^>]*>([\s\S]*?)<\/div>/);
184
+ const legalMatch = block.match(/Legal Basis<\/div>\s*<div[^>]*>([^<]+)/);
185
+ const consumerMatch = block.match(/Consumer Standard<\/div>\s*<div[^>]*>([^<]+)/);
186
+ const isLLM = block.includes("LLM Refined");
187
+
188
+ if (labelMatch) {
189
+ redlines.push({
190
+ clause_label: labelMatch[1].trim(),
191
+ risk_level: labelMatch[2].trim(),
192
+ original_text: origMatch ? origMatch[1].trim() : "",
193
+ safe_alternative: safeBlock ? safeBlock[1].replace(/<[^>]+>/g, "").trim() : "",
194
+ legal_basis: legalMatch ? legalMatch[1].trim() : "",
195
+ consumer_standard: consumerMatch ? consumerMatch[1].trim() : "",
196
+ tier: isLLM ? "llm_refined" : "template",
197
+ });
198
+ }
199
+ }
200
+ }
201
+ }
202
+
203
+ const modelStatus = analysisData.metadata?.model || "";
204
+
205
+ // FIX v4.1: Increment scan count in profiles table
206
+ await supabase
207
+ .from("profiles")
208
+ .update({ analyses_this_month: scanCount + 1 })
209
+ .eq("id", user.id);
210
+
211
+ return NextResponse.json({
212
+ risk_score: riskScore,
213
+ grade,
214
+ total_clauses: totalClauses,
215
+ flagged_count: flaggedCount,
216
+ results,
217
+ entities: analysisData.entities || [],
218
+ contradictions: analysisData.contradictions || [],
219
+ obligations: analysisData.obligations || [],
220
+ compliance: analysisData.compliance || {},
221
+ redlines,
222
+ model: modelStatus.includes("loaded") ? "ml" : "regex",
223
+ latency_ms: 0,
224
+ session_id: null,
225
+ });
226
+ } catch (error: any) {
227
+ console.error("Analyze error:", error.message);
228
+ return NextResponse.json(
229
+ { error: "Analysis failed: " + (error.message || "Try again in 30 seconds.") },
230
+ { status: 500 }
231
+ );
232
+ }
233
+ }