浏览代码

Pisimuudatused

ardo 3 月之前
父节点
当前提交
31f98e8dfe
共有 3 个文件被更改，包括 18 次插入和 42 次删除
  1. 4 1
      .env
  2. 12 40
      jupyter/WeaviateExportImport.ipynb
  3. 2 1
      src/config.py

+ 4 - 1
.env

@@ -1,3 +1,6 @@
 DEEPSEEK_API_KEY=sk-c6766d328c2446f78bfe509d3c7ad4b3
-WEAVIATE_URL=http://100.80.222.54:9020
+# hetzner
+#WEAVIATE_URL=http://100.80.222.54:9020
+# ohmu
+WEAVIATE_URL=http://100.87.1.24:8080
 PDF_SOURCE_DIR=./data/pdfs

+ 12 - 40
jupyter/WeaviateExportImport.ipynb

@@ -34,7 +34,9 @@
    "outputs": [],
    "source": [
     "# Ühenduse loomine\n",
-    "src_client = WeaviateExportImport.create_client(\"hetzner\", http_port=9020)"
+    "#src_client = WeaviateExportImport.create_client(\"hetzner\", http_port=9020)\n",
+    "\n",
+    "src_client = WeaviateExportImport.create_client(\"localhost\", http_port=8080)"
    ]
   },
   {
@@ -67,22 +69,6 @@
     "src_client.collections.delete(\"ScientificArticle\")\n"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ac436417",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Vektori vaatamine\n",
-    "results = dst_client.collections.get(\"Article\").query.fetch_objects(\n",
-    "    limit=2,\n",
-    "    include_vector=True\n",
-    ")\n",
-    "\n",
-    "print(results.objects[1].vector)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -91,18 +77,12 @@
    "outputs": [],
    "source": [
     "# Vektori vaatamine\n",
-    "results_src = src_client.collections.get(\"Article\").query.fetch_objects(\n",
-    "    limit=1,\n",
-    "    include_vector=True\n",
-    ")\n",
-    "\n",
-    "results_dst = dst_client.collections.get(\"Article\").query.fetch_objects(\n",
+    "results_src = src_client.collections.get(\"ScientificArticle\").query.fetch_objects(\n",
     "    limit=1,\n",
     "    include_vector=True\n",
     ")\n",
     "\n",
-    "print(results_src)\n",
-    "print(results_dst)"
+    "print(results_src)"
    ]
   },
   {
@@ -113,18 +93,12 @@
    "outputs": [],
    "source": [
     "# Vektori vaatamine\n",
-    "results_src = src_client.collections.get(\"DocIngest\").query.fetch_objects(\n",
-    "    limit=1,\n",
-    "    include_vector=True\n",
-    ")\n",
-    "\n",
-    "results_dst = dst_client.collections.get(\"DocIngest\").query.fetch_objects(\n",
+    "results_src = C.collections.get(\"DocIngest\").query.fetch_objects(\n",
     "    limit=1,\n",
     "    include_vector=True\n",
     ")\n",
     "\n",
-    "print(results_src)\n",
-    "print(results_dst)"
+    "print(results_src)"
    ]
   },
   {
@@ -134,9 +108,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "schema = src_client.collections.get(\"Article\").config.get()\n",
-    "print(schema)\n",
-    "schema = dst_client.collections.get(\"Article\").config.get()\n",
+    "schema = src_client.collections.get(\"ScientificArticle\").config.get()\n",
     "print(schema)"
    ]
   },
@@ -155,15 +127,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "wei.close_clients()"
+    "WeaviateExportImport.close_clients()"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": ".venv (3.10.12)",
+   "display_name": "PDF Pipeline (venv)",
    "language": "python",
-   "name": "python3"
+   "name": "pdf-env"
   },
   "language_info": {
    "codemirror_mode": {
@@ -175,7 +147,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

+ 2 - 1
src/config.py

@@ -29,7 +29,8 @@ class Config:
     batch_size: int = int(os.getenv("BATCH_SIZE", "5"))
     
     # Embedding mudel
-    embedding_model: str = "all-MiniLM-L6-v2"
+    #embedding_model: str = "all-MiniLM-L6-v2"
+    embedding_model: str = "BAAI/bge-small-en-v1.5"
     
     # Veel seadeid
     enable_logging: bool = True