defford committed on
Commit
0175efa
·
verified ·
1 Parent(s): acbbc4b

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +10 -29
handler.py CHANGED
@@ -2,54 +2,35 @@ import torch
2
  from PIL import Image
3
  import io
4
  import base64
5
- from transformers import AutoProcessor, AutoModelForImageTextToText
6
 
7
class EndpointHandler():
    """Custom inference handler: OCR a base64-encoded receipt image.

    Loads the repo's own (remote) processor/model code and answers each
    request with ``[{"generated_text": <decoded model output>}]``.
    """

    def __init__(self, path=""):
        # trust_remote_code=True tells transformers to load the custom
        # modeling/processing code shipped inside this model repo.
        self.processor = AutoProcessor.from_pretrained(path, trust_remote_code=True)
        self.model = AutoModelForImageTextToText.from_pretrained(
            path,
            trust_remote_code=True,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        # Inference only — switch off training-time behavior (dropout etc.).
        self.model.eval()

    def __call__(self, data):
        # Pull the base64 payload out of the Google Apps Script request body.
        payload = data.pop("inputs", data)
        raw = base64.b64decode(payload)
        receipt = Image.open(io.BytesIO(raw)).convert("RGB")

        # Bookkeeping prompt
        prompt = "Identify Date, Vendor, and list every Item with description, qty, and price. Return as a JSON array."

        # One user turn carrying the image plus the instruction, in the
        # chat-template shape GLM-OCR expects.
        user_turn = {
            "role": "user",
            "content": [
                {"type": "image", "image": receipt},
                {"type": "text", "text": prompt},
            ],
        }

        # Tokenize via the model's chat template and move tensors to the
        # device the model was dispatched to.
        model_inputs = self.processor.apply_chat_template(
            [user_turn],
            add_generation_prompt=True,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
        ).to(self.model.device)

        # Generate the reading without tracking gradients.
        with torch.no_grad():
            generated_ids = self.model.generate(**model_inputs, max_new_tokens=1024)

        # Decode the first (only) sequence back to text.
        text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return [{"generated_text": text}]
 
2
  from PIL import Image
3
  import io
4
  import base64
5
+ from transformers import GlmOcrProcessor, GlmOcrForConditionalGeneration
6
 
7
class EndpointHandler():
    """Inference handler for a GLM-OCR receipt-reading model.

    Expects a request payload whose ``"inputs"`` field holds a
    base64-encoded image; returns ``[{"generated_text": str}]``.
    """

    def __init__(self, path=""):
        # Native transformers classes for GLM-OCR — no trust_remote_code needed.
        self.processor = GlmOcrProcessor.from_pretrained(path)
        self.model = GlmOcrForConditionalGeneration.from_pretrained(
            path,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        # Inference only — disable dropout etc.
        self.model.eval()

    def __call__(self, data):
        """Run OCR on one base64-encoded receipt image.

        Parameters
        ----------
        data : dict
            Request payload; the image is read from ``data["inputs"]``.

        Returns
        -------
        list[dict]
            ``[{"generated_text": str}]``.

        Raises
        ------
        ValueError
            If the payload carries no base64 string under ``"inputs"``.
        """
        # Extract base64 image from the 'inputs' field sent by Google Sheets.
        inputs_data = data.pop("inputs", data)
        # FIX: the old fallback handed the whole dict to b64decode when
        # "inputs" was missing, raising an opaque TypeError. Fail clearly.
        if not isinstance(inputs_data, (str, bytes)):
            raise ValueError("Request payload must contain a base64-encoded image under 'inputs'")
        # Tolerate data-URL payloads ("data:image/png;base64,....") — a
        # common shape for images exported from Apps Script / browsers.
        if isinstance(inputs_data, str) and inputs_data.lstrip().startswith("data:"):
            inputs_data = inputs_data.split(",", 1)[-1]
        image_bytes = base64.b64decode(inputs_data)
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Bookkeeping prompt - Native formatting
        prompt = "Extract receipt items into JSON: [{date, vendor, description, qty, price, total}]"

        # Processor builds the joint image+text tensors; move them to the
        # device the model was dispatched to.
        inputs = self.processor(images=image, text=prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=1024)

        # FIX: decode only the newly generated tokens; decoding the full
        # sequence would echo the prompt back into the response.
        prompt_len = inputs["input_ids"].shape[-1]
        result = self.processor.batch_decode(
            generated_ids[:, prompt_len:], skip_special_tokens=True
        )[0]

        return [{"generated_text": result}]