版本变更V3.35将图像的处理统一更换到新表当中

2026-05-26 11:28:26 +08:00
parent 682139bab8
commit fb5b8d873b
9 changed files with 281 additions and 70 deletions
--- a/inventory-backend/app/api/v1/common/image_search.py
+++ b/inventory-backend/app/api/v1/common/image_search.py
@ -19,6 +19,14 @@ from app.models.base import MaterialBase
 # 注册蓝图
 image_search_bp = Blueprint('image_search', __name__)

+# ============================================================================
+# 可配置参数
+# ============================================================================
+# 以图搜图相似度阈值：余弦距离必须小于此值（距离越小越相似）
+# 即余弦相似度 = 1 - 余弦距离，必须 > (1 - SIMILARITY_DISTANCE_THRESHOLD)
+# 当前取值 0.40，对应余弦相似度 > 0.60（若改为 0.25，则对应余弦相似度 > 0.75）
+SIMILARITY_DISTANCE_THRESHOLD = 0.40
+

 # ============================================================================
 # POST /api/v1/common/image-search
@ -87,27 +95,80 @@ def image_search():
        ie.module_name,
        ie.target_id,
        ie.image_url,
-        (1 - (ie.embedding <=> :query_vector)) AS similarity
+        (1 - (ie.embedding <=> :query_vector)) AS similarity,
+        (ie.embedding <=> :query_vector)        AS distance
    FROM image_embeddings ie
    WHERE ie.embedding IS NOT NULL
+      AND (ie.embedding <=> :query_vector) < :distance_threshold
    ORDER BY ie.embedding <=> :query_vector
    LIMIT 200
 """)

-        raw_records = db.session.execute(sql, {"query_vector": query_vector_str}).fetchall()
+        raw_records = db.session.execute(sql, {
+            "query_vector": query_vector_str,
+            "distance_threshold": SIMILARITY_DISTANCE_THRESHOLD
+        }).fetchall()
        if not raw_records:
-            return jsonify({"code": 200, "data": []})
+            return jsonify({"code": 200, "data": [], "msg": "未找到相似图片（阈值过滤后）"})

-        # 按 (module_name, target_id) 去重，每业务记录只保留最相似的那张图
-        seen = {}
+        # ---------------------------------------------------------
+        # Step 1: 初步去重（同一业务记录 (module_name, target_id) 只保留最相似的图片）
+        # ---------------------------------------------------------
+        first_img_seen = {}
+        unique_records = []
        for row in raw_records:
            key = (row.module_name, row.target_id)
-            if key not in seen:
-                seen[key] = row
+            if key not in first_img_seen:
+                first_img_seen[key] = True
+                unique_records.append(row)

-        # 批量回填业务数据
+        # ---------------------------------------------------------
+        # Step 2: 按物料维度去重（相同物料只保留第一条 = 相似度最高的那条）
+        # ---------------------------------------------------------
        target_ids_by_module = {}
-        for row in seen.values():
+        for row in unique_records:
+            target_ids_by_module.setdefault(row.module_name, []).append(row.target_id)
+
+        # 查询每条记录的 base_id（跨 stock_buy/semi/product/material_base）
+        base_id_map = {}
+
+        for module in ('stock_buy', 'stock_semi', 'stock_product'):
+            if module not in target_ids_by_module:
+                continue
+            ids = target_ids_by_module[module]
+            ModelCls = StockBuy if module == 'stock_buy' else (StockSemi if module == 'stock_semi' else StockProduct)
+            id_col = getattr(ModelCls, 'id')
+            base_col = getattr(ModelCls, 'base_id')
+
+            rows = (
+                db.session.query(id_col, base_col)
+                .outerjoin(MaterialBase, base_col == MaterialBase.id)
+                .filter(id_col.in_(ids))
+                .all()
+            )
+            for rec_id, base_id in rows:
+                base_id_map[(module, rec_id)] = base_id
+
+        if 'material_base' in target_ids_by_module:
+            for rec_id in target_ids_by_module['material_base']:
+                base_id_map[('material_base', rec_id)] = rec_id
+
+        # 按 base_id 去重：相同物料只保留第一张图
+        material_seen = {}
+        final_records = []
+        for row in unique_records:
+            base_id = base_id_map.get((row.module_name, row.target_id))
+            if base_id is not None and base_id in material_seen:
+                continue
+            if base_id is not None:
+                material_seen[base_id] = True
+            final_records.append(row)
+
+        # ---------------------------------------------------------
+        # Step 3: 批量回填业务数据（基于去重后的 final_records）
+        # ---------------------------------------------------------
+        target_ids_by_module = {}
+        for row in final_records:
            target_ids_by_module.setdefault(row.module_name, []).append(row.target_id)

        business_map = {}
@ -205,9 +266,9 @@ def image_search():
                    'url': '/material/index',
                }

-        # 组装最终返回
+        # 组装最终返回（基于 final_records，按相似度从高到低）
        results = []
-        for row in seen.values():
+        for row in final_records:
            key = (row.module_name, row.target_id)
            biz = business_map.get(key, {})
            raw_url = row.image_url or ''