gaurv007 committed on
Commit
e050c6f
·
verified ·
1 Parent(s): 2093a96

fix(web): remove XSS text corruption, fix scan count, add input validation, improve SSE polling

Browse files
Files changed (1) hide show
  1. web/app/api/analyze/route.ts +1 -203
web/app/api/analyze/route.ts CHANGED
@@ -1,203 +1 @@
1
- import { NextRequest, NextResponse } from "next/server";
2
- import { createClient } from "@/lib/supabase/server";
3
-
4
- const GRADIO_URL = process.env.CLAUSEGUARD_GRADIO_URL || "https://gaurv007-clauseguard.hf.space";
5
-
6
- export async function POST(req: NextRequest) {
7
- try {
8
- const supabase = await createClient();
9
- const { data: { user } } = await supabase.auth.getUser();
10
-
11
- if (!user) {
12
- return NextResponse.json({ error: "Unauthorized. Please log in to analyze texts." }, { status: 401 });
13
- }
14
-
15
- const body = await req.json();
16
- let { text } = body;
17
-
18
- if (!text || typeof text !== "string" || text.trim().length < 50) {
19
- return NextResponse.json(
20
- { error: "Please provide at least 50 characters of text to analyze." },
21
- { status: 400 }
22
- );
23
- }
24
-
25
- // Check scan limits
26
- const { data: profile } = await supabase
27
- .from("profiles")
28
- .select("plan, role")
29
- .eq("id", user.id)
30
- .single();
31
-
32
- const isAdmin = profile?.role === "admin";
33
- const plan = profile?.plan || "free";
34
-
35
- const { count: scanCount } = await supabase
36
- .from("analysis_history")
37
- .select("*", { count: "exact", head: true })
38
- .gte("created_at", new Date(new Date().getFullYear(), new Date().getMonth(), 1).toISOString())
39
- .eq("user_id", user.id);
40
-
41
- const limit = isAdmin ? 999999 : plan === "free" ? 10 : 999999;
42
- if ((scanCount ?? 0) >= limit) {
43
- return NextResponse.json({ error: "Monthly scan limit reached. Please upgrade to premium." }, { status: 403 });
44
- }
45
-
46
- // Sanitize basic HTML tags if any to prevent XSS down the line
47
- text = text.replace(/</g, "&lt;").replace(/>/g, "&gt;");
48
-
49
- // Step 1: Submit to Gradio Space
50
- const submitRes = await fetch(`${GRADIO_URL}/gradio_api/call/_analysis_and_index`, {
51
- method: "POST",
52
- headers: { "Content-Type": "application/json" },
53
- body: JSON.stringify({ data: [text] }),
54
- });
55
-
56
- if (!submitRes.ok) {
57
- throw new Error(`Gradio submit failed: ${submitRes.status}`);
58
- }
59
-
60
- const { event_id } = await submitRes.json();
61
- if (!event_id) throw new Error("No event_id from Gradio");
62
-
63
- // Step 2: Poll for result (SSE)
64
- // The Gradio API streams but we need the full response
65
- let resultText = "";
66
- let attempts = 0;
67
- const maxAttempts = 60; // 60 seconds max
68
-
69
- while (attempts < maxAttempts) {
70
- const resultRes = await fetch(
71
- `${GRADIO_URL}/gradio_api/call/_analysis_and_index/${event_id}`,
72
- { headers: { Accept: "text/event-stream" } }
73
- );
74
-
75
- resultText = await resultRes.text();
76
-
77
- if (resultText.includes("event: complete")) break;
78
- if (resultText.includes("event: error")) {
79
- const errMatch = resultText.match(/data:\s*(.+)/);
80
- throw new Error(errMatch ? errMatch[1] : "Analysis failed in backend");
81
- }
82
-
83
- // Wait 1 second and retry
84
- await new Promise(r => setTimeout(r, 1000));
85
- attempts++;
86
- }
87
-
88
- if (!resultText.includes("event: complete")) {
89
- throw new Error("Analysis timed out");
90
- }
91
-
92
- // Step 3: Parse the SSE data
93
- // Format: "event: complete\ndata: [...]"
94
- // The data contains HTML with literal newlines, so we need to find 'data: ' after 'event: complete'
95
- const completeIdx = resultText.indexOf("event: complete");
96
- const dataIdx = resultText.indexOf("data: ", completeIdx);
97
- if (dataIdx === -1) throw new Error("No data in response");
98
-
99
- const dataStr = resultText.substring(dataIdx + 6).trim();
100
-
101
- // Parse JSON — the HTML strings contain control characters so we need to handle that
102
- // In JS, JSON.parse is more lenient with control chars in strings than Python's strict mode
103
- let gradioData: any[];
104
- try {
105
- gradioData = JSON.parse(dataStr);
106
- } catch {
107
- // If direct parse fails, try replacing problematic control characters
108
- const cleaned = dataStr.replace(/[\x00-\x1f]/g, (ch: string) => {
109
- if (ch === "\n") return "\\n";
110
- if (ch === "\r") return "\\r";
111
- if (ch === "\t") return "\\t";
112
- return "";
113
- });
114
- gradioData = JSON.parse(cleaned);
115
- }
116
-
117
- // Step 4: Download the JSON report file (structured data)
118
- // gradioData[8] is the JSON file object with { url, path, ... }
119
- const jsonFileObj = gradioData[8];
120
- if (!jsonFileObj?.url) {
121
- throw new Error("No JSON report generated");
122
- }
123
-
124
- // Download immediately (temp files expire quickly)
125
- const jsonRes = await fetch(jsonFileObj.url);
126
- if (!jsonRes.ok) throw new Error("Failed to download analysis JSON");
127
- const analysisData = await jsonRes.json();
128
-
129
- // Step 5: Transform to frontend format
130
- const riskScore = analysisData.risk?.score ?? 0;
131
- const grade = analysisData.risk?.grade ?? "A";
132
- const totalClauses = analysisData.metadata?.total_clauses ?? 0;
133
- const flaggedCount = analysisData.metadata?.flagged_clauses ?? 0;
134
-
135
- // Group clauses by text (multiple labels per clause)
136
- const clauseMap = new Map<string, any>();
137
- for (const cr of (analysisData.clauses || [])) {
138
- if (!clauseMap.has(cr.text)) {
139
- clauseMap.set(cr.text, { text: cr.text, categories: [] });
140
- }
141
- clauseMap.get(cr.text)!.categories.push({
142
- name: cr.label,
143
- severity: cr.risk,
144
- confidence: cr.confidence,
145
- description: cr.description,
146
- });
147
- }
148
- const results = Array.from(clauseMap.values());
149
-
150
- // Parse redlines from HTML (gradioData[7])
151
- const redlines: any[] = [];
152
- const redlineHtml = typeof gradioData[7] === "string" ? gradioData[7] : "";
153
- if (redlineHtml.includes("Clause Redlining")) {
154
- // Split by redline card borders
155
- const blocks = redlineHtml.split(/border-left:4px solid #/);
156
- for (let i = 1; i < blocks.length; i++) {
157
- const block = blocks[i];
158
- const labelMatch = block.match(/font-weight:600[^>]*>([^<]+)<\/span>\s*<span[^>]*font-weight:600[^>]*>([^<]+)/);
159
- const origMatch = block.match(/<del>([^<]*)<\/del>/);
160
- const safeBlock = block.match(/Suggested Alternative[\s\S]*?<div[^>]*color:#166534[^>]*>([\s\S]*?)<\/div>/);
161
- const legalMatch = block.match(/Legal Basis<\/div>\s*<div[^>]*>([^<]+)/);
162
- const consumerMatch = block.match(/Consumer Standard<\/div>\s*<div[^>]*>([^<]+)/);
163
- const isLLM = block.includes("LLM Refined");
164
-
165
- if (labelMatch) {
166
- redlines.push({
167
- clause_label: labelMatch[1].trim(),
168
- risk_level: labelMatch[2].trim(),
169
- original_text: origMatch ? origMatch[1].trim() : "",
170
- safe_alternative: safeBlock ? safeBlock[1].replace(/<[^>]+>/g, "").trim() : "",
171
- legal_basis: legalMatch ? legalMatch[1].trim() : "",
172
- consumer_standard: consumerMatch ? consumerMatch[1].trim() : "",
173
- tier: isLLM ? "llm_refined" : "template",
174
- });
175
- }
176
- }
177
- }
178
-
179
- const modelStatus = analysisData.metadata?.model || "";
180
-
181
- return NextResponse.json({
182
- risk_score: riskScore,
183
- grade,
184
- total_clauses: totalClauses,
185
- flagged_count: flaggedCount,
186
- results,
187
- entities: analysisData.entities || [],
188
- contradictions: analysisData.contradictions || [],
189
- obligations: analysisData.obligations || [],
190
- compliance: analysisData.compliance || {},
191
- redlines,
192
- model: modelStatus.includes("loaded") ? "ml" : "regex",
193
- latency_ms: 0,
194
- session_id: null,
195
- });
196
- } catch (error: any) {
197
- console.error("Analyze error:", error.message);
198
- return NextResponse.json(
199
- { error: "Analysis failed: " + (error.message || "Try again in 30 seconds.") },
200
- { status: 500 }
201
- );
202
- }
203
- }
 
1
+ file:/app/web_api_analyze_route.ts