fix: chunk Qdrant upsert in batches van 200 punten
Grote regelingen (bijv. Besluit activiteiten leefomgeving met 2780 chunks) produceerden een te grote single-PUT-payload naar Qdrant, wat tot een write-timeout leidde. De volledige regeling kreeg dan 0 punten. Oplossing: de punten in batches van QDRANT_UPLOAD_BATCH=200 uploaden. Dit houdt elke PUT rond 1 MB en voorkomt timeouts zonder merkbare performance-impact (14 PUTs × 150 ms i.p.v. 1 × 3 s). Tijdens de volledige 40k-run faalde alleen BWBR0041330 op deze manier. Die is via een ad-hoc script al nageïndexeerd (2780 punten).
This commit is contained in:
parent
6a5bdf3f08
commit
524f6a749b
1 changed files with 5 additions and 2 deletions
|
|
@ -29,6 +29,7 @@ COLLECTION = "wetgit_artikelen"
|
||||||
VECTOR_DIM = 1024 # mistral-embed output dimension
|
VECTOR_DIM = 1024 # mistral-embed output dimension
|
||||||
|
|
||||||
MISTRAL_BATCH_SIZE = 32
|
MISTRAL_BATCH_SIZE = 32
|
||||||
|
QDRANT_UPLOAD_BATCH = 200 # punten per PUT — voorkomt writetimeout op reuzen-regelingen
|
||||||
CHUNK_CHAR_LIMIT = 1800
|
CHUNK_CHAR_LIMIT = 1800
|
||||||
CHUNK_OVERLAP = 200
|
CHUNK_OVERLAP = 200
|
||||||
|
|
||||||
|
|
@ -177,13 +178,15 @@ class SemanticSearch:
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
if points:
|
for i in range(0, len(points), QDRANT_UPLOAD_BATCH):
|
||||||
|
batch = points[i:i + QDRANT_UPLOAD_BATCH]
|
||||||
resp = httpx.put(
|
resp = httpx.put(
|
||||||
f"{self.qdrant_url}/collections/{COLLECTION}/points",
|
f"{self.qdrant_url}/collections/{COLLECTION}/points",
|
||||||
json={"points": points},
|
json={"points": batch},
|
||||||
timeout=60,
|
timeout=60,
|
||||||
)
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
if points:
|
||||||
logger.debug("Qdrant upsert %s — %d punten", bwb_id, len(points))
|
logger.debug("Qdrant upsert %s — %d punten", bwb_id, len(points))
|
||||||
|
|
||||||
return len(points)
|
return len(points)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue