{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4eda7add",
   "metadata": {},
   "source": [
    "# Weaviate"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4956326d",
   "metadata": {},
   "source": [
    "## Ühenduse loomine Weaviate serveriga"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f1ebd05",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "from weaviate import WeaviateClient\n",
    "from weaviate.classes.query import Filter\n",
    "from weaviate.connect import ConnectionParams"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ac2f77bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connection settings, kept in one place so HTTP and gRPC cannot drift apart.\n",
    "# The host can be overridden with the WEAVIATE_HOST environment variable;\n",
    "# without it the original address is used.\n",
    "WEAVIATE_HOST = os.environ.get(\"WEAVIATE_HOST\", \"100.87.1.24\")\n",
    "WEAVIATE_HTTP_PORT = 8080\n",
    "WEAVIATE_GRPC_PORT = 50051\n",
    "COLLECTION_NAME = \"ScientificArticle\"\n",
    "\n",
    "# Connect over plain (non-TLS) HTTP and gRPC\n",
    "client = WeaviateClient(\n",
    "    connection_params=ConnectionParams.from_params(\n",
    "        http_host=WEAVIATE_HOST,\n",
    "        http_port=WEAVIATE_HTTP_PORT,\n",
    "        http_secure=False,\n",
    "        grpc_host=WEAVIATE_HOST,\n",
    "        grpc_port=WEAVIATE_GRPC_PORT,\n",
    "        grpc_secure=False,\n",
    "    )\n",
    ")\n",
    "client.connect()\n",
    "\n",
    "print(f\"✓ Ühendatud Weaviate'ga: {client.is_ready()}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49994031",
   "metadata": {},
   "source": [
    "## Muud käsud"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9201cb41",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch a small sample of objects and report how many came back\n",
    "collection = client.collections.get(COLLECTION_NAME)\n",
    "results = collection.query.fetch_objects(limit=10)\n",
    "print(f\"Leitud {len(results.objects)} objekti\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93b0b0e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw dump of the sample fetched in the previous cell\n",
    "print(results)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3a7bbbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show selected properties of the first two objects\n",
    "collection = client.collections.get(COLLECTION_NAME)\n",
    "results = collection.query.fetch_objects(limit=2)\n",
    "\n",
    "for obj in results.objects:\n",
    "    props = obj.properties\n",
    "    print(f\"Title: {props.get('title')}, DOI: {props.get('doi')}, source_file: {props.get('source_file')}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "07970b96",
   "metadata": {},
   "outputs":
[],
   "source": [
    "# Directory prefixes stripped from stored `source_file` values.\n",
    "PDF_DIR_PREFIXES = (\"./data/pdfs/\", \"data/pdfs/\")\n",
    "\n",
    "\n",
    "def clean_source_filename(source_file):\n",
    "    \"\"\"Reduce a stored ``source_file`` value to a name relative to the PDF directory.\n",
    "\n",
    "    Args:\n",
    "        source_file: Raw ``source_file`` property of an object, or ``None``\n",
    "            when the property is missing.\n",
    "\n",
    "    Returns:\n",
    "        ``None`` if ``source_file`` is ``None``; the text after a leading\n",
    "        ``./data/pdfs/`` or ``data/pdfs/`` prefix when one is present;\n",
    "        otherwise the final path component.\n",
    "    \"\"\"\n",
    "    if source_file is None:\n",
    "        return None\n",
    "    source_file = source_file.strip()\n",
    "    for prefix in PDF_DIR_PREFIXES:\n",
    "        if source_file.startswith(prefix):\n",
    "            # Slice rather than str.replace so only the leading prefix is removed.\n",
    "            return source_file[len(prefix):]\n",
    "    return Path(source_file).name\n",
    "\n",
    "\n",
    "# Show the raw and the cleaned file name for a couple of objects\n",
    "collection = client.collections.get('ScientificArticle')\n",
    "results = collection.query.fetch_objects(limit=2)\n",
    "\n",
    "for obj in results.objects:\n",
    "    raw_source_file = obj.properties.get('source_file')\n",
    "    print(f\"filename: {raw_source_file}\")\n",
    "    print(f\"filename: {clean_source_filename(raw_source_file)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6ccfa014",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find the articles whose source PDFs contain NUL bytes\n",
    "problem_files = [\n",
    "    'Shaver-StatisticalSignificanceTesting-1993.pdf',\n",
    "    'Safety-effectiveness-of-forward-collision-warning-syste2025Accident-Analys.pdf',\n",
    "    '1910.13885v1.pdf',\n",
    "    '2503.14666v1.pdf',\n",
    "    '2503.14914v1.pdf'\n",
    "]\n",
    "\n",
    "collection = client.collections.get('ScientificArticle')\n",
    "\n",
    "# Walk the whole collection with the cursor-based iterator, so no object is\n",
    "# skipped because of a fixed fetch limit.\n",
    "for obj in collection.iterator():\n",
    "    props = obj.properties\n",
    "    # Relies on clean_source_filename defined in the previous cell.\n",
    "    filename = clean_source_filename(props.get('source_file'))\n",
    "    if filename not in problem_files:\n",
    "        continue\n",
    "\n",
    "    print(f\"clean filename: {filename}\")\n",
    "    article = props.get('title')\n",
    "    article_id = props.get('article_id')\n",
    "\n",
    "    print(f\"Artikkel: {article} - {article_id}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "86adec54",
   "metadata": {},
   "source": [
    "## Sulge ühendused"
   ]
  },
  {
   "cell_type": "code",
   "execution_count":
null,
   "id": "e654ae89",
   "metadata": {},
   "outputs": [],
   "source": [
    "client.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2a93bd44",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Usage pattern: always release the connection, even when the query fails.\n",
    "# The client was closed in the previous cell, so reconnect before querying;\n",
    "# per the Weaviate client docs connect() returns early if already connected.\n",
    "client.connect()\n",
    "try:\n",
    "    collection = client.collections.get(\"ScientificArticle\")\n",
    "    results = collection.query.fetch_objects(limit=10)\n",
    "    print(f\"Leitud {len(results.objects)} objekti\")\n",
    "finally:\n",
    "    client.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PDF Pipeline (venv)",
   "language": "python",
   "name": "pdf-env"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}