Spaces:
Running
Running
Commit ·
a724f5f
1
Parent(s): 0ae85bd
update
Browse files- retrieve.py +1 -46
retrieve.py
CHANGED
|
@@ -86,52 +86,7 @@ def search(
|
|
| 86 |
exc_info=True,
|
| 87 |
)
|
| 88 |
raise
|
| 89 |
-
|
| 90 |
-
def validate_chunk_sequencing(results: List[Dict[str, Any]]) -> bool:
    """
    Check that chunk_index values in a result set are well formed.

    A search may return only some of the chunks belonging to a URL, so full
    sequential continuity (0, 1, 2, ...) cannot be verified here. Two weaker
    properties are checked instead:
        - every chunk_index is a non-negative integer (booleans are rejected)
        - no chunk_index appears twice for the same URL within ``results``

    Results whose payload has no "url" key are grouped together under the
    empty string, so they are checked against each other for duplicates.

    Args:
        results: List of search results. Each result is a dict whose
            "payload" entry (a dict, or missing/None) may carry "url" and
            "chunk_index".

    Returns:
        True if every chunk index is valid and unique per URL (including the
        case of an empty result list), False otherwise.
    """
    # Group chunk indices by URL
    url_chunks: Dict[str, List[Any]] = {}
    for result in results:
        # A payload that is missing or explicitly None is treated as empty
        # instead of raising AttributeError on the lookups below.
        payload = result.get("payload") or {}
        url = payload.get("url", "")
        url_chunks.setdefault(url, []).append(payload.get("chunk_index"))

    # Check each URL's chunks are valid
    for url, indices in url_chunks.items():
        for idx in indices:
            # bool is a subclass of int, so True/False would otherwise pass
            # the integer check and be treated as 1/0.
            if isinstance(idx, bool) or not isinstance(idx, int) or idx < 0:
                logger.debug(
                    f"Invalid chunk_index for {url}: {idx} (must be non-negative integer)"
                )
                return False

        # All indices are hashable ints at this point, so a set comparison
        # is a safe way to detect duplicates within this URL's results.
        if len(set(indices)) != len(indices):
            logger.debug(f"Duplicate chunk_index for {url}: {indices}")
            return False

    logger.debug(f"Chunk indexing valid for {len(url_chunks)} URLs")
    return True
|
| 133 |
-
|
| 134 |
-
|
| 135 |
def search(
|
| 136 |
query_text: str,
|
| 137 |
cohere_client: cohere.ClientV2,
|
|
|
|
| 86 |
exc_info=True,
|
| 87 |
)
|
| 88 |
raise
|
| 89 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
def search(
|
| 91 |
query_text: str,
|
| 92 |
cohere_client: cohere.ClientV2,
|