defford committed on
Commit
3877d00
·
verified ·
1 Parent(s): 57bc9be

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +11 -17
handler.py CHANGED
@@ -1,15 +1,15 @@
1
  import torch
2
  from PIL import Image
3
- from transformers import AutoProcessor, AutoModelForVision2Seq
4
  import io
5
  import base64
 
 
6
 
7
  class EndpointHandler():
8
  def __init__(self, path=""):
9
- # We MUST use trust_remote_code=True because the architecture
10
- # is defined in the files you downloaded, not in the library itself.
11
- self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
12
- self.model = AutoModelForVision2Seq.from_pretrained(
13
  path,
14
  trust_remote_code=True,
15
  device_map="auto",
@@ -18,27 +18,21 @@ class EndpointHandler():
18
  self.model.eval()
19
 
20
  def __call__(self, data):
21
- # Handle the input format from Google Sheets
22
  inputs_data = data.pop("inputs", data)
23
-
24
- # If the data comes in as a string (base64)
25
- if isinstance(inputs_data, str):
26
- image_bytes = base64.b64decode(inputs_data)
27
- else:
28
- # Handle direct bytes if necessary
29
- image_bytes = inputs_data
30
-
31
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
32
 
33
- # The prompt that tells the AI what to look for
34
- prompt = "Extract all line items. Return a JSON array of objects with: date, vendor, description, qty, price, total."
35
 
36
- # Process the image and text
37
  model_inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device)
38
 
39
  with torch.no_grad():
40
  generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024)
41
 
 
42
  result = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
43
 
44
  return [{"generated_text": result}]
 
1
  import torch
2
  from PIL import Image
 
3
  import io
4
  import base64
5
+ # We use the explicit classes to avoid the 'Auto' detection errors
6
+ from transformers import GlmOcrProcessor, GlmOcrForConditionalGeneration
7
 
8
  class EndpointHandler():
9
  def __init__(self, path=""):
10
+ # Explicitly load the processor and model
11
+ self.processor = GlmOcrProcessor.from_pretrained(path, trust_remote_code=True)
12
+ self.model = GlmOcrForConditionalGeneration.from_pretrained(
 
13
  path,
14
  trust_remote_code=True,
15
  device_map="auto",
 
18
  self.model.eval()
19
 
20
  def __call__(self, data):
21
+ # Extract base64 from Google Apps Script
22
  inputs_data = data.pop("inputs", data)
23
+ image_bytes = base64.b64decode(inputs_data)
 
 
 
 
 
 
 
24
  image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
25
 
26
+ # Specific prompt for structured bookkeeping
27
+ prompt = "Identify Date, Vendor, and list every Item with description, qty, and price. Return as a JSON array."
28
 
29
+ # Process the input
30
  model_inputs = self.processor(text=prompt, images=image, return_tensors="pt").to(self.model.device)
31
 
32
  with torch.no_grad():
33
  generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024)
34
 
35
+ # Decode output
36
  result = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
37
 
38
  return [{"generated_text": result}]