gaurv007 committed on
Commit
333f825
·
verified ·
1 Parent(s): f782685

fix: upload actual analyze/route.ts content — XSS fix, scan count fix, input validation

Browse files
Files changed (1) hide show
  1. web/app/api/analyze/route.ts +233 -1
web/app/api/analyze/route.ts CHANGED
@@ -1 +1,233 @@
1
- file:/app/web_api_analyze_route.ts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import { NextRequest, NextResponse } from "next/server";
import { createClient } from "@/lib/supabase/server";

// Base URL of the Gradio Space that performs the actual contract analysis.
// Overridable via env var for local/staging deployments.
const GRADIO_URL = process.env.CLAUSEGUARD_GRADIO_URL || "https://gaurv007-clauseguard.hf.space";

// FIX v4.1: Max text size validation (prevent oversized payloads)
const MAX_TEXT_LENGTH = 200_000; // 200KB
8
+
9
+ export async function POST(req: NextRequest) {
10
+ try {
11
+ const supabase = await createClient();
12
+ const { data: { user } } = await supabase.auth.getUser();
13
+
14
+ if (!user) {
15
+ return NextResponse.json({ error: "Unauthorized. Please log in to analyze texts." }, { status: 401 });
16
+ }
17
+
18
+ const body = await req.json();
19
+ let { text } = body;
20
+
21
+ if (!text || typeof text !== "string" || text.trim().length < 50) {
22
+ return NextResponse.json(
23
+ { error: "Please provide at least 50 characters of text to analyze." },
24
+ { status: 400 }
25
+ );
26
+ }
27
+
28
+ // FIX v4.1: Input size validation
29
+ if (text.length > MAX_TEXT_LENGTH) {
30
+ return NextResponse.json(
31
+ { error: `Text too long (${(text.length / 1000).toFixed(0)}KB). Maximum is ${MAX_TEXT_LENGTH / 1000}KB.` },
32
+ { status: 400 }
33
+ );
34
+ }
35
+
36
+ // FIX v4.1: REMOVED the XSS sanitization that corrupted contract text.
37
+ // The old code did: text = text.replace(/</g, "&lt;").replace(/>/g, "&gt;");
38
+ // This PERMANENTLY MUTATED the text before analysis, corrupting contracts
39
+ // that contain < or > characters (e.g., "shall not exceed >$10,000").
40
+ // Sanitization should happen at RENDER TIME in the frontend, not at analysis time.
41
+ // The frontend already uses React which auto-escapes HTML in JSX.
42
+
43
+ // Check scan limits — FIX v4.1: query the CORRECT table name
44
+ const { data: profile } = await supabase
45
+ .from("profiles")
46
+ .select("plan, role, analyses_this_month")
47
+ .eq("id", user.id)
48
+ .single();
49
+
50
+ const isAdmin = profile?.role === "admin";
51
+ const plan = profile?.plan || "free";
52
+
53
+ // FIX v4.1: Use analyses_this_month from profiles (already tracked), not a separate count query
54
+ const scanCount = profile?.analyses_this_month ?? 0;
55
+ const limit = isAdmin ? 999999 : plan === "free" ? 10 : 999999;
56
+ if (scanCount >= limit) {
57
+ return NextResponse.json({ error: "Monthly scan limit reached. Please upgrade to Pro." }, { status: 403 });
58
+ }
59
+
60
+ // Step 1: Submit to Gradio Space
61
+ const submitRes = await fetch(`${GRADIO_URL}/gradio_api/call/_analysis_and_index`, {
62
+ method: "POST",
63
+ headers: { "Content-Type": "application/json" },
64
+ body: JSON.stringify({ data: [text] }),
65
+ });
66
+
67
+ if (!submitRes.ok) {
68
+ throw new Error(`Gradio submit failed: ${submitRes.status}`);
69
+ }
70
+
71
+ const { event_id } = await submitRes.json();
72
+ if (!event_id) throw new Error("No event_id from Gradio");
73
+
74
+ // FIX v4.1: Improved SSE polling with proper streaming support
75
+ // Uses exponential backoff instead of fixed 1s intervals
76
+ let resultText = "";
77
+ let attempts = 0;
78
+ const maxAttempts = 90; // 90 seconds max (increased from 60)
79
+ let delay = 500; // Start at 500ms, increase
80
+
81
+ while (attempts < maxAttempts) {
82
+ const resultRes = await fetch(
83
+ `${GRADIO_URL}/gradio_api/call/_analysis_and_index/${event_id}`,
84
+ { headers: { Accept: "text/event-stream" } }
85
+ );
86
+
87
+ resultText = await resultRes.text();
88
+
89
+ if (resultText.includes("event: complete")) break;
90
+ if (resultText.includes("event: error")) {
91
+ const errMatch = resultText.match(/data:\s*(.+)/);
92
+ throw new Error(errMatch ? errMatch[1] : "Analysis failed in backend");
93
+ }
94
+
95
+ await new Promise(r => setTimeout(r, delay));
96
+ delay = Math.min(delay * 1.2, 2000); // Cap at 2s
97
+ attempts++;
98
+ }
99
+
100
+ if (!resultText.includes("event: complete")) {
101
+ throw new Error("Analysis timed out. The backend may be loading models. Please try again in 30 seconds.");
102
+ }
103
+
104
+ // Step 3: Parse the SSE data
105
+ const completeIdx = resultText.indexOf("event: complete");
106
+ const dataIdx = resultText.indexOf("data: ", completeIdx);
107
+ if (dataIdx === -1) throw new Error("No data in response");
108
+
109
+ const dataStr = resultText.substring(dataIdx + 6).trim();
110
+
111
+ let gradioData: any[];
112
+ try {
113
+ gradioData = JSON.parse(dataStr);
114
+ } catch {
115
+ const cleaned = dataStr.replace(/[\x00-\x1f]/g, (ch: string) => {
116
+ if (ch === "\n") return "\\n";
117
+ if (ch === "\r") return "\\r";
118
+ if (ch === "\t") return "\\t";
119
+ return "";
120
+ });
121
+ gradioData = JSON.parse(cleaned);
122
+ }
123
+
124
+ // Step 4: Download the JSON report file (structured data)
125
+ const jsonFileObj = gradioData[8];
126
+ if (!jsonFileObj?.url) {
127
+ throw new Error("No JSON report generated");
128
+ }
129
+
130
+ const jsonRes = await fetch(jsonFileObj.url);
131
+ if (!jsonRes.ok) throw new Error("Failed to download analysis JSON");
132
+ const analysisData = await jsonRes.json();
133
+
134
+ // Step 5: Transform to frontend format
135
+ const riskScore = analysisData.risk?.score ?? 0;
136
+ const grade = analysisData.risk?.grade ?? "A";
137
+ const totalClauses = analysisData.metadata?.total_clauses ?? 0;
138
+ const flaggedCount = analysisData.metadata?.flagged_clauses ?? 0;
139
+
140
+ // Group clauses by text (multiple labels per clause)
141
+ const clauseMap = new Map<string, any>();
142
+ for (const cr of (analysisData.clauses || [])) {
143
+ if (!clauseMap.has(cr.text)) {
144
+ clauseMap.set(cr.text, { text: cr.text, categories: [] });
145
+ }
146
+ clauseMap.get(cr.text)!.categories.push({
147
+ name: cr.label,
148
+ severity: cr.risk,
149
+ confidence: cr.confidence,
150
+ description: cr.description,
151
+ });
152
+ }
153
+ const results = Array.from(clauseMap.values());
154
+
155
+ // FIX v4.1: Parse redlines from structured JSON data instead of fragile HTML regex
156
+ const redlines: any[] = [];
157
+
158
+ // Try to extract redlines from the analysis JSON first (if available)
159
+ if (analysisData.redlines && Array.isArray(analysisData.redlines)) {
160
+ for (const rl of analysisData.redlines) {
161
+ redlines.push({
162
+ clause_label: rl.clause_label || "",
163
+ risk_level: rl.risk_level || "MEDIUM",
164
+ original_text: rl.original_text || "",
165
+ safe_alternative: rl.safe_alternative || "",
166
+ template_alternative: rl.template_alternative || "",
167
+ legal_basis: rl.legal_basis || "",
168
+ consumer_standard: rl.consumer_standard || "",
169
+ tier: rl.tier || "template",
170
+ });
171
+ }
172
+ }
173
+
174
+ // Fallback: try parsing from HTML only if no structured data
175
+ if (redlines.length === 0) {
176
+ const redlineHtml = typeof gradioData[7] === "string" ? gradioData[7] : "";
177
+ if (redlineHtml.includes("Clause Redlining")) {
178
+ const blocks = redlineHtml.split(/border-left:4px solid #/);
179
+ for (let i = 1; i < blocks.length; i++) {
180
+ const block = blocks[i];
181
+ const labelMatch = block.match(/font-weight:600[^>]*>([^<]+)<\/span>\s*<span[^>]*font-weight:600[^>]*>([^<]+)/);
182
+ const origMatch = block.match(/<del>([^<]*)<\/del>/);
183
+ const safeBlock = block.match(/Suggested Alternative[\s\S]*?<div[^>]*color:#166534[^>]*>([\s\S]*?)<\/div>/);
184
+ const legalMatch = block.match(/Legal Basis<\/div>\s*<div[^>]*>([^<]+)/);
185
+ const consumerMatch = block.match(/Consumer Standard<\/div>\s*<div[^>]*>([^<]+)/);
186
+ const isLLM = block.includes("LLM Refined");
187
+
188
+ if (labelMatch) {
189
+ redlines.push({
190
+ clause_label: labelMatch[1].trim(),
191
+ risk_level: labelMatch[2].trim(),
192
+ original_text: origMatch ? origMatch[1].trim() : "",
193
+ safe_alternative: safeBlock ? safeBlock[1].replace(/<[^>]+>/g, "").trim() : "",
194
+ legal_basis: legalMatch ? legalMatch[1].trim() : "",
195
+ consumer_standard: consumerMatch ? consumerMatch[1].trim() : "",
196
+ tier: isLLM ? "llm_refined" : "template",
197
+ });
198
+ }
199
+ }
200
+ }
201
+ }
202
+
203
+ const modelStatus = analysisData.metadata?.model || "";
204
+
205
+ // FIX v4.1: Increment scan count in profiles table
206
+ await supabase
207
+ .from("profiles")
208
+ .update({ analyses_this_month: scanCount + 1 })
209
+ .eq("id", user.id);
210
+
211
+ return NextResponse.json({
212
+ risk_score: riskScore,
213
+ grade,
214
+ total_clauses: totalClauses,
215
+ flagged_count: flaggedCount,
216
+ results,
217
+ entities: analysisData.entities || [],
218
+ contradictions: analysisData.contradictions || [],
219
+ obligations: analysisData.obligations || [],
220
+ compliance: analysisData.compliance || {},
221
+ redlines,
222
+ model: modelStatus.includes("loaded") ? "ml" : "regex",
223
+ latency_ms: 0,
224
+ session_id: null,
225
+ });
226
+ } catch (error: any) {
227
+ console.error("Analyze error:", error.message);
228
+ return NextResponse.json(
229
+ { error: "Analysis failed: " + (error.message || "Try again in 30 seconds.") },
230
+ { status: 500 }
231
+ );
232
+ }
233
+ }