Spaces:

cstr
/

LLMProviders

Running

App Files Community

CrispStrobe committed 29 days ago

Commit

704a297

1 Parent(s): 7cc1131

feat: simplify OCR display and optimize HF repo validation with caching

Browse files

Files changed (4) hide show

data/providers.json +0 -0
scripts/fetch-providers.js +2 -0
scripts/validate-hf-ids.js +49 -30
src/App.tsx +27 -31

data/providers.json CHANGED Viewed

The diff for this file is too large to render. See raw diff

scripts/fetch-providers.js CHANGED Viewed

@@ -57,6 +57,8 @@ function updateProviderModels(providers, providerName, models) {
       ollama_id: newModel.ollama_id || existing.ollama_id,
       hf_private: newModel.hf_private ?? existing.hf_private,
       audio_price_per_1m: newModel.audio_price_per_1m || existing.audio_price_per_1m,
       capabilities: (newModel.capabilities && newModel.capabilities.length > 0)
         ? newModel.capabilities
         : existing.capabilities,

       ollama_id: newModel.ollama_id || existing.ollama_id,
       hf_private: newModel.hf_private ?? existing.hf_private,
       audio_price_per_1m: newModel.audio_price_per_1m || existing.audio_price_per_1m,
+      hf_validated_at: existing.hf_validated_at,
+      hf_status: existing.hf_status,
       capabilities: (newModel.capabilities && newModel.capabilities.length > 0)
         ? newModel.capabilities
         : existing.capabilities,

scripts/validate-hf-ids.js CHANGED Viewed

@@ -22,41 +22,60 @@ async function checkHfId(hfId) {
 }
 async function main() {
-  console.log('Starting Hugging Face Repository Validation...\n');
   const data = JSON.parse(fs.readFileSync(PROVIDERS_FILE, 'utf8'));
   const hfIdToModels = new Map();
   data.providers.forEach(p => {
     p.models.forEach(m => {
       if (m.hf_id) {
         if (!hfIdToModels.has(m.hf_id)) hfIdToModels.set(m.hf_id, []);
         hfIdToModels.get(m.hf_id).push(`${p.name}: ${m.name}`);
       }
     });
   });
   const ids = Array.from(hfIdToModels.keys());
-  console.log(`Found ${ids.length} unique HF IDs to validate across all providers.\n`);
   const invalidIds = new Set();
-  const results = {
-    valid: 0,
-    invalid: 0,
-    errors: 0
-  };
   for (let i = 0; i < ids.length; i++) {
     const id = ids[i];
     const progress = `[${i + 1}/${ids.length}]`.padEnd(10);
-    const check = await checkHfId(id);
     if (check.valid) {
-      results.valid++;
       console.log(`${progress} ✓ VALID   (${check.status}) ${id}`);
     } else {
-      results.invalid++;
       console.log(`${progress} ✗ INVALID (${check.status}) ${id}`);
       console.log(`          Used by: ${hfIdToModels.get(id).join(', ')}`);
       invalidIds.add(id);
@@ -66,30 +85,30 @@ async function main() {
     await new Promise(r => setTimeout(r, 50));
   }
-  console.log('\n' + '='.repeat(50));
-  console.log('VALIDATION SUMMARY');
-  console.log('='.repeat(50));
-  console.log(`Total Unique IDs:  ${ids.length}`);
-  console.log(`Valid IDs:         ${results.valid}`);
-  console.log(`Invalid (404s):    ${results.invalid}`);
-  console.log('='.repeat(50));
-  if (invalidIds.size > 0) {
-    console.log(`\nAction: Removing ${invalidIds.size} invalid HF IDs from providers.json...`);
-    let removalCount = 0;
-    data.providers.forEach(p => {
-      p.models.forEach(m => {
-        if (m.hf_id && invalidIds.has(m.hf_id)) {
           delete m.hf_id;
           removalCount++;
         }
-      });
     });
-    fs.writeFileSync(PROVIDERS_FILE, JSON.stringify(data, null, 2));
-    console.log(`Successfully removed ${removalCount} occurrences.`);
-  } else {
-    console.log('\nSuccess: All checked HF IDs exist on Hugging Face.');
-  }
 }
 main().catch(err => {

 }
 async function main() {
+  const force = process.argv.includes('--force');
+  console.log('Starting Hugging Face Repository Validation...');
+  if (force) console.log('  [!] Force mode enabled: checking all IDs regardless of cache.\n');
+  else console.log('  [i] Using cache: only checking IDs not validated in the last 30 days.\n');
   const data = JSON.parse(fs.readFileSync(PROVIDERS_FILE, 'utf8'));
   const hfIdToModels = new Map();
+  const hfIdMeta = new Map(); // Store metadata (validated_at, status)
   data.providers.forEach(p => {
     p.models.forEach(m => {
       if (m.hf_id) {
         if (!hfIdToModels.has(m.hf_id)) hfIdToModels.set(m.hf_id, []);
         hfIdToModels.get(m.hf_id).push(`${p.name}: ${m.name}`);
+        // Cache metadata if present
+        if (m.hf_validated_at && m.hf_status === 200) {
+          const existing = hfIdMeta.get(m.hf_id);
+          if (!existing || new Date(m.hf_validated_at) > new Date(existing.at)) {
+            hfIdMeta.set(m.hf_id, { at: m.hf_validated_at, status: m.hf_status });
+          }
+        }
       }
     });
   });
   const ids = Array.from(hfIdToModels.keys());
+  console.log(`Found ${ids.length} unique HF IDs to validate.\n`);
   const invalidIds = new Set();
+  const now = new Date();
+  const THIRTY_DAYS_MS = 30 * 24 * 60 * 60 * 1000;
+  const validationResults = new Map(); // id -> { status, at }
   for (let i = 0; i < ids.length; i++) {
     const id = ids[i];
     const progress = `[${i + 1}/${ids.length}]`.padEnd(10);
+    const cached = hfIdMeta.get(id);
+    const isRecent = cached && (now - new Date(cached.at) < THIRTY_DAYS_MS);
+    if (isRecent && !force) {
+      console.log(`${progress} ≈ CACHED  (${cached.status}) ${id} (last checked ${new Date(cached.at).toLocaleDateString()})`);
+      validationResults.set(id, { status: cached.status, at: cached.at });
+      continue;
+    }
+    const check = await checkHfId(id);
+    validationResults.set(id, { status: typeof check.status === 'number' ? check.status : 200, at: now.toISOString() });
     if (check.valid) {
       console.log(`${progress} ✓ VALID   (${check.status}) ${id}`);
     } else {
       console.log(`${progress} ✗ INVALID (${check.status}) ${id}`);
       console.log(`          Used by: ${hfIdToModels.get(id).join(', ')}`);
       invalidIds.add(id);
     await new Promise(r => setTimeout(r, 50));
   }
+  console.log('\nUpdating providers.json with validation results...');
+  let updatedCount = 0;
+  let removalCount = 0;
+  data.providers.forEach(p => {
+    p.models.forEach(m => {
+      if (m.hf_id) {
+        const res = validationResults.get(m.hf_id);
+        if (invalidIds.has(m.hf_id)) {
           delete m.hf_id;
+          delete m.hf_validated_at;
+          delete m.hf_status;
           removalCount++;
+        } else if (res) {
+          m.hf_validated_at = res.at;
+          m.hf_status = res.status;
+          updatedCount++;
         }
+      }
     });
+  });
+  fs.writeFileSync(PROVIDERS_FILE, JSON.stringify(data, null, 2));
+  console.log(`Done. Updated ${updatedCount} models, removed ${removalCount} invalid IDs.`);
 }
 main().catch(err => {

src/App.tsx CHANGED Viewed

@@ -372,8 +372,7 @@ function App() {
         case 'aa_intelligence':
         case 'aa_tokens_per_s':
         case 'mteb_avg':
-        case 'mteb_retrieval':
-        case 'ocr_avg': {
           try {
             const bA = findBenchmark(a.name);
             const bB = findBenchmark(b.name);
@@ -530,7 +529,6 @@ function App() {
                 <th onClick={() => requestSort('aa_tokens_per_s')} className="sortable" title="Artificial Analysis Median Speed (Tokens per Second)">AA Speed {getSortIcon('aa_tokens_per_s')}</th>
                 <th onClick={() => requestSort('mteb_avg')} className="sortable" title="MTEB (Massive Text Embedding Benchmark) Average">MTEB {getSortIcon('mteb_avg')}</th>
                 <th onClick={() => requestSort('mteb_retrieval')} className="sortable" title="MTEB Retrieval Average">MTEB-Ret {getSortIcon('mteb_retrieval')}</th>
-                <th onClick={() => requestSort('ocr_avg')} className="sortable" title="OCR (Optical Character Recognition) Benchmark">OCR {getSortIcon('ocr_avg')}</th>
                 <th onClick={() => requestSort('lb_global')} className="sortable" title="LiveBench overall average (contamination-free)">LB {getSortIcon('lb_global')}</th>
                 <th onClick={() => requestSort('lb_math')} className="sortable" title="LiveBench Mathematics">LB-Math {getSortIcon('lb_math')}</th>
                 <th onClick={() => requestSort('lb_coding')} className="sortable" title="LiveBench Coding + Agentic Coding">LB-Code {getSortIcon('lb_coding')}</th>
@@ -554,6 +552,7 @@ function App() {
                 (prev.hf_id?.toLowerCase() !== model.hf_id?.toLowerCase()) ||
                 (!model.hf_id && prev.name.toLowerCase() !== model.name.toLowerCase())
               );
               return (
                 <tr key={`${model.provider.name}-${model.name}-${idx}`} className={isGroupStart ? 'group-divider' : ''}>
                   <td className="provider-cell">{model.provider.name}</td>
@@ -592,6 +591,9 @@ function App() {
                           {model.capabilities && model.capabilities.length > 0 && (
                             <div className="tooltip-row"><strong>Caps:</strong> {model.capabilities.join(', ')}</div>
                           )}
                         </div>
                       </div>
                     </div>
@@ -626,34 +628,28 @@ function App() {
                       ? '–'
                       : formatPrice(model.output_price_per_1m, model.currency)}
                   </td>
-                  {showBenchmarks && (() => {
-                    try {
-                      const bm = findBenchmark(model.name);
-                      return <>
-                        <td className="benchmark-cell">{fmtNum(bm?.arena_elo)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.aider_pass_rate)}</td>
-                        <td className="benchmark-cell">{fmtNum(bm?.aa_intelligence)}</td>
-                        <td className="benchmark-cell">{fmtNum(bm?.aa_tokens_per_s)}</td>
-                        <td className="benchmark-cell">{fmtNum(bm?.mteb_avg, 1)}</td>
-                        <td className="benchmark-cell">{fmtNum(bm?.mteb_retrieval, 1)}</td>
-                        <td className="benchmark-cell">{fmtNum(bm?.ocr_avg, 1)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.lb_global)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.lb_math)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.lb_coding)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.lb_reasoning)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.gpqa)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.mmlu_pro)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.ifeval)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.bbh)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.hf_math_lvl5)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.hf_musr)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.mmlu)}</td>
-                        <td className="benchmark-cell">{fmtPct(bm?.human_eval)}</td>
-                      </>;
-                    } catch (e) {
-                      return Array(19).fill(null).map((_, i) => <td key={i} className="benchmark-cell">–</td>);
-                    }
-                  })()}
                 </tr>
               )
             })}

         case 'aa_intelligence':
         case 'aa_tokens_per_s':
         case 'mteb_avg':
+        case 'mteb_retrieval': {
           try {
             const bA = findBenchmark(a.name);
             const bB = findBenchmark(b.name);
                 <th onClick={() => requestSort('aa_tokens_per_s')} className="sortable" title="Artificial Analysis Median Speed (Tokens per Second)">AA Speed {getSortIcon('aa_tokens_per_s')}</th>
                 <th onClick={() => requestSort('mteb_avg')} className="sortable" title="MTEB (Massive Text Embedding Benchmark) Average">MTEB {getSortIcon('mteb_avg')}</th>
                 <th onClick={() => requestSort('mteb_retrieval')} className="sortable" title="MTEB Retrieval Average">MTEB-Ret {getSortIcon('mteb_retrieval')}</th>
                 <th onClick={() => requestSort('lb_global')} className="sortable" title="LiveBench overall average (contamination-free)">LB {getSortIcon('lb_global')}</th>
                 <th onClick={() => requestSort('lb_math')} className="sortable" title="LiveBench Mathematics">LB-Math {getSortIcon('lb_math')}</th>
                 <th onClick={() => requestSort('lb_coding')} className="sortable" title="LiveBench Coding + Agentic Coding">LB-Code {getSortIcon('lb_coding')}</th>
                 (prev.hf_id?.toLowerCase() !== model.hf_id?.toLowerCase()) ||
                 (!model.hf_id && prev.name.toLowerCase() !== model.name.toLowerCase())
               );
+              const bm = findBenchmark(model.name);
               return (
                 <tr key={`${model.provider.name}-${model.name}-${idx}`} className={isGroupStart ? 'group-divider' : ''}>
                   <td className="provider-cell">{model.provider.name}</td>
                           {model.capabilities && model.capabilities.length > 0 && (
                             <div className="tooltip-row"><strong>Caps:</strong> {model.capabilities.join(', ')}</div>
                           )}
+                          {bm?.ocr_avg !== undefined && (
+                            <div className="tooltip-row"><strong>OCR:</strong> {bm.ocr_avg.toFixed(1)} (Benchmark)</div>
+                          )}
                         </div>
                       </div>
                     </div>
                       ? '–'
                       : formatPrice(model.output_price_per_1m, model.currency)}
                   </td>
+                  {showBenchmarks && (
+                    <>
+                      <td className="benchmark-cell">{fmtNum(bm?.arena_elo)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.aider_pass_rate)}</td>
+                      <td className="benchmark-cell">{fmtNum(bm?.aa_intelligence)}</td>
+                      <td className="benchmark-cell">{fmtNum(bm?.aa_tokens_per_s)}</td>
+                      <td className="benchmark-cell">{fmtNum(bm?.mteb_avg, 1)}</td>
+                      <td className="benchmark-cell">{fmtNum(bm?.mteb_retrieval, 1)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.lb_global)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.lb_math)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.lb_coding)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.lb_reasoning)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.gpqa)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.mmlu_pro)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.ifeval)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.bbh)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.hf_math_lvl5)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.hf_musr)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.mmlu)}</td>
+                      <td className="benchmark-cell">{fmtPct(bm?.human_eval)}</td>
+                    </>
+                  )}
                 </tr>
               )
             })}