Browse source

Working code with weaviate export

ardo 3 months ago
parent
commit
31e2b690f3

+ 41 - 0
.env

@@ -0,0 +1,41 @@
+# PostgreSQL
+DB_HOST=100.87.1.24
+DB_PORT=5432
+DB_NAME=pdf_research
+DB_USER=osm
+DB_PASSWORD=osm
+DATABASE_URL='postgresql://osm:osm@100.87.1.24:5432/pdf_research'
+
+
+# DeepSeek (later)
+DEEPSEEK_API_KEY=sk-c6766d328c2446f78bfe509d3c7ad4b3
+EMBED_MODEL=deepseek-embedding-v2
+EMBED_MODEL_NAME=BAAI/bge-small-en-v1.5
+
+LLAMA_CPP_URL=http://100.87.1.24:8070/completion
+LLAMA_EMBED_BIN=/home/ardo/llama.cpp/build/bin/llama-embedding
+LLAMA_EMBED_MODEL=/models/mistral-7b-v0.1.Q4_K_M.gguf
+EMBED_DIM=384
+
+# Open AI
+OPENAI_API_KEY=sk-None-opa0spEeiLKr0wsQCDa9T3BlbkFJbkv6l9uZUA6xNTEiBK5R
+
+# Project settings
+PDF_INPUT_DIR=data/raw_pdfs
+EXTRACTED_OUTPUT_DIR=data/extracted
+PROCESSED_OUTPUT_DIR=data/processed
+CHUNKS_OUTPUT_DIR=data/chunks
+
+CHUNK_SIZE=1024
+CHUNK_OVERLAP=200
+
+# Weaviate
+#WEAVIATE_URL=http://localhost:8080
+WEAVIATE_HOST=100.87.1.24
+WEAVIATE_HTTP_PORT=8080
+WEAVIATE_URL=http://100.87.1.24:8080
+WEAVIATE_API_KEY=
+WEAVIATE_CLASS=ScientificArticle
+
+# Ollama
+OLLAMA_URL=http://100.87.1.24:11434
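
For reference, a minimal sketch of how these settings are consumed at runtime (python-dotenv, which the src/ modules below already use; names match the .env keys above):

# sketch: reading the .env settings the same way the src/ modules do
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

DATABASE_URL = os.getenv("DATABASE_URL")
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://localhost:8080")
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1024"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))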

+ 2 - 1
.gitignore

@@ -58,7 +58,7 @@ docs/_build/
 # PyBuilder
 target/
 
-# Ardo
+# ArdoS
 testimiseks/
 data/chunks
 data/extracted
@@ -66,3 +66,4 @@ data/processed
 data/raw_pdfs
 archive/
 output/
+help/tmp

+ 6 - 2
LOEMIND.md

@@ -14,11 +14,17 @@ python -m src.extract_pdf  # with fixed extraction
 python -m src.check_status  # with a better overview
 
 python -m src.find_duplicates  # duplicate check
+# document processing
 python -m src.clean_and_normalize
 
 python -m src.create_chunks
+# The script below has a BATCH_LIMIT parameter
+# Re-run it until all embeddings have been added (see the loop sketch after this section)
 python -m src.embed_chunks
 
+# Synchronization with the Weaviate database
+python -m src.sync_weaviate
+
 # Testing
 docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "SELECT COUNT(*) FROM processed_documents;"
 docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "
@@ -28,8 +34,6 @@ docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "
   LIMIT 20;
 "
 
-# Synchronization with the Weaviate database
-python -m src.sync_weaviate
 
 python -m src.query_hybrid
 python -m src.generate_answer
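
The BATCH_LIMIT re-run noted above can be looped; a hedged sketch (the chunks table and its embedding column are assumptions, since src.embed_chunks itself is not shown in this commit):

# sketch: re-run src.embed_chunks until no chunks are left without an embedding
# ASSUMPTION: a chunks table with an embedding column; adjust to the real schema
import subprocess
import psycopg2

conn = psycopg2.connect("postgresql://osm:osm@100.87.1.24:5432/pdf_research")
while True:
    subprocess.run(["python", "-m", "src.embed_chunks"], check=True)
    with conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM chunks WHERE embedding IS NULL")
        remaining = cur.fetchone()[0]
    if remaining == 0:
        break
conn.close()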

+ 3 - 0
help/huvitavate_asjade_nimekiri.md

@@ -0,0 +1,3 @@
+# Interesting things
+
+1. Gravitation article - 1009.6090v1.pdf

+ 208 - 0
jupyter/weaviate_testimine.ipynb

@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4eda7add",
+   "metadata": {},
+   "source": [
+    "# Weaviate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4956326d",
+   "metadata": {},
+   "source": [
+    "## Ühenduse loomine Weaviate serveriga"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "6f1ebd05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from weaviate import WeaviateClient\n",
+    "from weaviate.connect import ConnectionParams\n",
+    "from weaviate.classes.query import Filter\n",
+    "\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac2f77bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ühenda HTTP abil\n",
+    "client = WeaviateClient(\n",
+    "    connection_params=ConnectionParams.from_params(\n",
+    "        http_host=\"100.87.1.24\",\n",
+    "        http_port=8080,\n",
+    "        http_secure=False,\n",
+    "        grpc_host=\"100.87.1.24\",\n",
+    "        grpc_port=50051,\n",
+    "        grpc_secure=False,\n",
+    "    )\n",
+    ")\n",
+    "client.connect()\n",
+    "\n",
+    "print(f\"✓ Ühendatud Weaviate'ga: {client.is_ready()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49994031",
+   "metadata": {},
+   "source": [
+    "## Muud käsud"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9201cb41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection = client.collections.get(\"ScientificArticle\")\n",
+    "results = collection.query.fetch_objects(limit=10)\n",
+    "print(f\"Leitud {len(results.objects)} objekti\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "93b0b0e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(results)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c3a7bbbb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection = client.collections.get('ScientificArticle')\n",
+    "results = collection.query.fetch_objects(limit=2)\n",
+    "\n",
+    "for obj in results.objects:\n",
+    "    props = obj.properties\n",
+    "    print(f\"Title: {props.get('title')}, DOI: {props.get('doi')}, source_file: {props.get('source_file')}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "07970b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection = client.collections.get('ScientificArticle')\n",
+    "results = collection.query.fetch_objects(limit=2)\n",
+    "\n",
+    "for obj in results.objects:\n",
+    "    props = obj.properties\n",
+    "    filename = props.get('source_file')\n",
+    "    print(f\"filename: {filename}\")\n",
+    "    source_file = filename.strip()\n",
+    "    if source_file.startswith(\"./data/pdfs/\"):\n",
+    "        filename = source_file.replace(\"./data/pdfs/\", \"\")\n",
+    "        if source_file.startswith(\"data/pdfs/\"):\n",
+    "            filename = source_file.replace(\"data/pdfs/\", \"\")\n",
+    "        else:\n",
+    "            filename = Path(source_file).name\n",
+    "    print(f\"filename: {filename}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "6ccfa014",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Leia artiklid, mis sisaldavad NUL-baite\n",
+    "problem_files = [\n",
+    "    'Shaver-StatisticalSignificanceTesting-1993.pdf',\n",
+    "    'Safety-effectiveness-of-forward-collision-warning-syste2025Accident-Analys.pdf',\n",
+    "    '1910.13885v1.pdf',\n",
+    "    '2503.14666v1.pdf',\n",
+    "    '2503.14914v1.pdf'\n",
+    "]\n",
+    "\n",
+    "collection = client.collections.get('ScientificArticle')\n",
+    "results = collection.query.fetch_objects(limit=900)\n",
+    "\n",
+    "for obj in results.objects:\n",
+    "    props = obj.properties\n",
+    "    filename = props.get('source_file')\n",
+    "    source_file = filename.strip()\n",
+    "    if source_file.startswith(\"./data/pdfs/\"):\n",
+    "        filename = source_file.replace(\"./data/pdfs/\", \"\")\n",
+    "        if source_file.startswith(\"data/pdfs/\"):\n",
+    "            filename = source_file.replace(\"data/pdfs/\", \"\")\n",
+    "        else:\n",
+    "            filename = Path(source_file).name\n",
+    "\n",
+    "    for problem_filename in problem_files:\n",
+    "        if problem_filename == filename:\n",
+    "            print(f\"clean filename: {filename}\")\n",
+    "            article = props.get('title')\n",
+    "            article_id = props.get('article_id')\n",
+    "\n",
+    "            print(f\"Artikkel: {article} - {article_id}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "86adec54",
+   "metadata": {},
+   "source": [
+    "## Sulge ühendused"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e654ae89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a93bd44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Kasuta\n",
+    "try:\n",
+    "    collection = client.collections.get(\"ScientificArticle\")\n",
+    "    results = collection.query.fetch_objects(limit=10)\n",
+    "    print(f\"Leitud {len(results.objects)} objekti\")\n",
+    "finally:\n",
+    "    client.close()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PDF Pipeline (venv)",
+   "language": "python",
+   "name": "pdf-env"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
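
The notebook imports weaviate.classes.query.Filter but never exercises it; a minimal sketch of a filtered fetch with the v4 client (property names as in the cells above):

# sketch: fetch only the objects whose source_file matches one problem file
from weaviate.classes.query import Filter

collection = client.collections.get("ScientificArticle")
results = collection.query.fetch_objects(
    filters=Filter.by_property("source_file").like("*1910.13885v1.pdf"),
    limit=10,
)
for obj in results.objects:
    print(obj.uuid, obj.properties.get("title"))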

+ 16 - 2
scripts/db_vector_välja_muutmine.sh

@@ -16,11 +16,24 @@ docker exec -it postgres_postgis bash -c "
   \"
 "
 
-# Add a weaviate_article_id column to raw_documents
+# Add extra columns to raw_documents for carrying data over from the Weaviate database
 docker exec -it postgres_postgis bash -c "
   export PGPASSWORD='osm' &&
   psql -U osm -h localhost -d pdf_research -c \"
-ALTER TABLE raw_documents ADD COLUMN weaviate_article_id UUID;
+ALTER TABLE raw_documents
+  ADD COLUMN IF NOT EXISTS weaviate_article_id UUID,
+  ADD COLUMN IF NOT EXISTS title TEXT,
+  ADD COLUMN IF NOT EXISTS doi TEXT,
+  ADD COLUMN IF NOT EXISTS journal TEXT,
+  ADD COLUMN IF NOT EXISTS year REAL,
+  ADD COLUMN IF NOT EXISTS authors TEXT[],
+  ADD COLUMN IF NOT EXISTS key_concepts TEXT[],
+  ADD COLUMN IF NOT EXISTS methods_used TEXT[],
+  ADD COLUMN IF NOT EXISTS summary_et TEXT,
+  ADD COLUMN IF NOT EXISTS abstract_en TEXT,
+  ADD COLUMN IF NOT EXISTS transport_context JSONB,
+  ADD COLUMN IF NOT EXISTS relevance_score REAL,
+  ADD COLUMN IF NOT EXISTS processing_date TIMESTAMPTZ;
 CREATE INDEX idx_raw_weaviate_id ON raw_documents(weaviate_article_id);
   \"
 "
@@ -31,3 +44,4 @@ docker exec -it postgres_postgis bash -c "
   SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'raw_documents' ORDER BY ordinal_position;
   \"
 "
+
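
The same information_schema check the script runs through docker exec can also be done from Python; a minimal sketch using psycopg2 with the credentials from .env:

# sketch: list the raw_documents columns to confirm the ALTER TABLE took effect
import psycopg2

conn = psycopg2.connect("postgresql://osm:osm@100.87.1.24:5432/pdf_research")
with conn.cursor() as cur:
    cur.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'raw_documents'
        ORDER BY ordinal_position
    """)
    for name, dtype in cur.fetchall():
        print(f"{name}: {dtype}")
conn.close()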

+ 130 - 53
src/clean_and_normalize.py

@@ -1,6 +1,8 @@
 # file: src/clean_and_normalize.py
+
 import os
 import json
+import re
 from pathlib import Path
 from dotenv import load_dotenv
 import psycopg2
@@ -25,81 +27,156 @@ def get_db_conn():
     )
 
 def clean_text_basic(text: str) -> str:
-    # Very simple cleanup: trim, collapse repeated whitespace into one space
-    import re
+    """
+    Puhastab tekstist ohtlikud karakterid:
+    - NUL baited (0x00)
+    - Juhtimiskarakateid
+    - Mitmekordseid tühikuid
+    - Mitmekordseid reavahetus
+    """
     if not text:
         return ""
+    
+    # ⚠️ IMPORTANT: remove NUL bytes
+    text = text.replace("\x00", "")
+    
+    # Remove the other control characters (keep \n, \t, \r),
+    # i.e. keep only printable characters
+    text = "".join(
+        ch for ch in text 
+        if ord(ch) >= 32 or ch in "\n\t\r"
+    )
+    
+    # Regular cleanup
     text = text.replace("\r", "\n")
-    text = text.replace("\u00a0", " ")  # non‑breaking space
-    text = re.sub(r"[ \t]+", " ", text)
-    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = text.replace("\u00a0", " ")  # non-breaking space
+    text = re.sub(r"[ \t]+", " ", text)  # mitmekordsed tühikud
+    text = re.sub(r"\n{3,}", "\n\n", text)  # mitmekordsed reavahetus
+    
     return text.strip()
 
 def process_single_doc(raw_doc_id: int, filename: str):
+    """
+    Töötleb ühe dokumendi pages.json failist ja sisestab 
+    cleaned teksti processed_documents tabelisse.
+    """
     pdf_stem = Path(filename).stem
     json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"
-
+    
     if not json_path.exists():
         print(f"❌ pages.json puudub: {json_path}")
         return
-
-    pages_data = json.loads(json_path.read_text())
-
+    
+    # Load the JSON with error handling
+    try:
+        # Use errors='replace' to cope with invalid bytes
+        raw_json_content = json_path.read_text(encoding='utf-8', errors='replace')
+        pages_data = json.loads(raw_json_content)
+    except json.JSONDecodeError as e:
+        print(f"❌ JSON parse error {filename}: {e}")
+        return
+    except Exception as e:
+        print(f"❌ Viga JSON laadimisel {filename}: {e}")
+        return
+    
     conn = get_db_conn()
     cur = conn.cursor()
-
     inserted = 0
-
-    for page in pages_data:
-        page_num = page["page"]
-        raw_text = page.get("text") or ""
-        text = clean_text_basic(raw_text)
-
-        if not text:
-            print(f"  ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
-            continue
-
-        cur.execute("""
-            INSERT INTO processed_documents
-                (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
-            VALUES
-                (%s, %s, %s, %s, %s, %s, %s)
-        """, (
-            raw_doc_id,
-            page_num,
-            f"page_{page_num}",
-            text,
-            False,
-            None,
-            None,
-        ))
-        inserted += 1
-
-    conn.commit()
-    cur.close()
-    conn.close()
-
-    print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")
+    failed = 0
+    
+    try:
+        for page_data in pages_data:
+            try:
+                page_num = page_data.get("page", 0)
+                raw_text = page_data.get("text") or ""
+                
+                # Clean the text
+                text = clean_text_basic(raw_text)
+                
+                if not text:
+                    print(f"  ⚠️ Leht {page_num}: tühi tekst pärast puhastamist")
+                    failed += 1
+                    continue
+                
+                # Extra guard: check whether NUL bytes survived cleaning
+                if "\x00" in text:
+                    print(f"  ⚠️ Page {page_num}: NUL bytes after cleaning?")
+                    text = text.replace("\x00", "")
+                
+                # INSERT into processed_documents
+                cur.execute("""
+                    INSERT INTO processed_documents
+                    (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
+                    VALUES
+                    (%s, %s, %s, %s, %s, %s, %s)
+                """, (
+                    raw_doc_id,
+                    page_num,
+                    f"page_{page_num}",
+                    text,
+                    False,
+                    None,
+                    None,
+                ))
+                
+                inserted += 1
+                
+            except ValueError as e:
+                # NUL byte error
+                if "NUL" in str(e):
+                    print(f"  ⚠️ Page {page_num}: NUL byte error - skipping")
+                    failed += 1
+                else:
+                    raise
+            
+            except Exception as e:
+                print(f"  ⚠️ Leht {page_num} error: {type(e).__name__}: {e}")
+                failed += 1
+                continue
+        
+        # All inserts done; commit once
+        if inserted > 0:
+            conn.commit()
+        
+    except Exception as e:
+        conn.rollback()
+        print(f"❌ Viga {filename} töötlemises: {e}")
+        return
+    
+    finally:
+        cur.close()
+        conn.close()
+    
+    print(f"✓ {filename}: lisatud {inserted} rida, {failed} vahele jäänud")
 
 def process_all_docs():
+    """Töötleb kõik raw_documents failid."""
     conn = get_db_conn()
     cur = conn.cursor()
-
-    # Fetch all raw documents
-    cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
-    docs = cur.fetchall()
-    cur.close()
-    conn.close()
-
+    
+    try:
+        # Fetch all raw documents
+        cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
+        docs = cur.fetchall()
+    except Exception as e:
+        print(f"❌ Viga dokumentide loetlemises: {e}")
+        return
+    finally:
+        cur.close()
+        conn.close()
+    
     if not docs:
         print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
         return
-
-    for raw_doc_id, filename in docs:
-        print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
+    
+    total = len(docs)
+    for idx, (raw_doc_id, filename) in enumerate(docs, 1):
+        print(f"\n🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id}) [{idx}/{total}]")
         process_single_doc(raw_doc_id, filename)
-
-    print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")
+    
+    print("\n" + "="*70)
+    print("✓ Kõik dokumendid töödeldud (lehe-tasemel tekstina).")
+    print("="*70)
 
 if __name__ == "__main__":
     process_all_docs()
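
A quick demonstration of what clean_text_basic removes (the input string is made up for illustration):

# sketch: clean_text_basic on a hostile string
from src.clean_and_normalize import clean_text_basic

dirty = "Intro\x00duction\x07  to   road\r\nsafety\n\n\n\nResults"
print(repr(clean_text_basic(dirty)))
# NUL (0x00) and BEL (0x07) are dropped, space runs collapse,
# and runs of 3+ newlines shrink to a single blank line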

+ 7 - 6
src/embed_utils.py

@@ -1,12 +1,14 @@
 # file: src/embed_utils.py
+
 import os
+from typing import List  # ← added
 from dotenv import load_dotenv
-from sentence_transformers import SentenceTransformer  # [web:133][web:134][web:135][web:141]
+from sentence_transformers import SentenceTransformer
 
 load_dotenv()
 
 EMBED_MODEL_NAME = os.getenv("EMBED_MODEL_NAME", "BAAI/bge-small-en-v1.5")
-EMBED_DIM = int(os.getenv("EMBED_DIM", "384"))  # bge-small-en-v1.5 [web:134][web:135]
+EMBED_DIM = int(os.getenv("EMBED_DIM", "384"))
 
 _model: SentenceTransformer | None = None
 
@@ -17,15 +19,14 @@ def get_model() -> SentenceTransformer:
         _model = SentenceTransformer(EMBED_MODEL_NAME)
     return _model
 
-def get_embedding(text: str) -> list[float]:
+def get_embedding(text: str) -> List[float]:  # ← fixed
     """
     Returns the embedding of a single chunk of text.
-    BGE v1.5 does not strictly require a special instruction; it can be used directly. [web:133][web:135][web:140][web:146]
+    BGE v1.5 does not need a special instruction prefix; the text can be embedded directly.
     """
     text = (text or "").strip()
     if not text:
         return []
-
     model = get_model()
-    emb = model.encode(text, normalize_embeddings=True)  # normalization suits cosine [web:133][web:135][web:141]
+    emb = model.encode(text, normalize_embeddings=True)
     return emb.tolist()
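
Because get_embedding returns L2-normalized vectors, cosine similarity reduces to a plain dot product; a small usage sketch (the two phrases are illustrative):

# sketch: cosine similarity between two normalized embeddings
from src.embed_utils import get_embedding

a = get_embedding("forward collision warning systems")
b = get_embedding("rear-end crash avoidance")
cosine = sum(x * y for x, y in zip(a, b))  # dot product of unit vectors
print(f"cosine similarity: {cosine:.3f}")  # vectors are EMBED_DIM=384 long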

+ 49 - 78
src/generate_answer.py

@@ -1,13 +1,20 @@
 # file: src/generate_answer.py
+
 import os
 import json
 import requests
+import logging
 from typing import List, Dict
 from dotenv import load_dotenv
 
+logger = logging.getLogger(__name__)
+
 load_dotenv()
 
-LLAMA_CPP_URL = os.getenv("LLAMA_CPP_URL", "http://localhost:8070/completion")
+# ===== IMPORTANT: environment variable =====
+LLAMA_CPP_URL = os.getenv("LLAMA_CPP_URL", "http://100.87.1.24:8070/completion")
+
+logger.info(f"🔧 llama.cpp URL: {LLAMA_CPP_URL}")
 
 def deduplicate_chunks(chunks: List[Dict]) -> List[Dict]:
     """Eemaldab duplikaadi chunk'id."""
@@ -23,28 +30,21 @@ def deduplicate_chunks(chunks: List[Dict]) -> List[Dict]:
     return unique_chunks
 
 def build_context(articles: List[Dict], chunks: List[Dict], max_chars: int = 8000) -> str:
-    """
-    Suurendatud kontekst. Weaviate andmete osa oluliselt suurem.
-    """
+    """Suurendatud kontekst. Weaviate andmete osa oluliselt suurem."""
     context_parts = []
     char_count = 0
-    
     unique_chunks = deduplicate_chunks(chunks)
     
-    # ENLARGED: Weaviate article metadata
     context_parts.append("=== ARTICLE SUMMARIES FROM WEAVIATE ===\n")
-    for i, art in enumerate(articles[:8], 1):  # 5 → 8 articles!
-        
-        # Assemble rich article info
+    
+    for i, art in enumerate(articles[:8], 1):
         authors_str = ', '.join(art['authors'][:3]) if art['authors'] else 'N/A'
-        keywords_str = ', '.join(art['key_concepts'][:6]) if art['key_concepts'] else 'N/A'
+        keywords_str = ', '.join(art['key_concepts'][:6]) if art.get('key_concepts') else 'N/A'
         methods_str = ', '.join(art['methods_used'][:4]) if art.get('methods_used') else 'N/A'
         
-        # Parse transport_context (a JSON string)
         transport_str = "N/A"
         try:
             if art.get('transport_context'):
-                import json
                 tc = json.loads(art['transport_context'])
                 theoretical = tc.get('theoretical_contribution', '')
                 practical = tc.get('practical_applicability', '')
@@ -53,24 +53,26 @@ def build_context(articles: List[Dict], chunks: List[Dict], max_chars: int = 800
             transport_str = art.get('transport_context', '')[:300]
         
         article_text = f"""[Article {i}] {art['title']}
+
 Authors: {authors_str}
 Year: {art.get('year', 'N/A')}
 Keywords: {keywords_str}
 Methods Used: {methods_str}
 Transport Context: {transport_str}
-Summary (Estonian): {art['summary_et'][:900]}  ← 350 → 900 chars!
+Summary (Estonian): {art['summary_et'][:900]}
 
 """
         context_parts.append(article_text)
         char_count += len(article_text)
         
-        if char_count > max_chars * 0.45:  # 45% for the articles
+        if char_count > max_chars * 0.45:
             break
     
-    # pgvector excerpts (same as before)
     context_parts.append("\n=== TEXT EXCERPTS FROM SOURCES ===\n")
+    
     for i, chunk in enumerate(unique_chunks[:15], 1):
         chunk_text = f"""[Excerpt {i}] ({chunk['filename']}, page {chunk['page']})
+
 {chunk['text'][:280]}
 
 """
@@ -83,24 +85,24 @@ Summary (Estonian): {art['summary_et'][:900]}  ← 350 → 900 chars!
     return "\n".join(context_parts)
 
 def generate_answer(query: str, context: str, temperature: float = 0.6, max_tokens: int = 1500) -> str:
-    """
-    Genereerib vastuse. Sama kui enne, aga kontekst on rikkam.
-    """
-    
+    """Genereerib vastuse. Kasutab llama.cpp REST API."""
     prompt = f"""You are an expert in scientific research on transportation safety and traffic engineering.
 
 CONTEXT (from scientific sources in Weaviate database):
+
 {context}
 
 USER QUESTION (in Estonian):
+
 {query}
 
 INSTRUCTIONS:
+
 - Answer in fluent Estonian language
 - Use data and findings from the sources provided above
 - Cite articles or excerpts by number in square brackets (e.g., "[Article 1]" or "[Excerpt 3]")
 - Be concrete, detailed, and factual
-- Write at least 250 words (important: use the context fully)
+- Write at least 250 words
 - Do NOT speculate or add information not present in the context
 - Reference the methods, findings, and practical applications mentioned in the articles
 - Structure your answer clearly with paragraphs
@@ -116,61 +118,61 @@ ANSWER (in Estonian):"""
         "presence_penalty": 0.1,
         "frequency_penalty": 0.1,
     }
-
-    print(f"📤 Saandan päring llama.cpp'le...")
-    print(f"   📊 Konteksti suurus: {len(context)} märki ≈ {len(context)//4} tokenit")
+    
+    logger.info(f"📤 Saandan päring llama.cpp'le...")
+    logger.info(f" 📊 Konteksti suurus: {len(context)} märki")
+    logger.info(f" 🔗 URL: {LLAMA_CPP_URL}")
     
     try:
         response = requests.post(LLAMA_CPP_URL, json=payload, timeout=240)
         response.raise_for_status()
-        result = response.json()
         
+        result = response.json()
         answer = result.get("content", "").strip()
         tokens_predicted = result.get("tokens_predicted", 0)
         
-        print(f"   ✅ Genereeriud: {tokens_predicted} tokenit, {len(answer.split())} sõna")
+        logger.info(f" ✅ Genereeriud: {tokens_predicted} tokenit")
         
         if not answer:
             return "⚠️ LLM tagastas tühja vastuse."
         
         word_count = len(answer.split())
         if word_count < 100:
-            print(f"   ⚠️  Liiga lühike ({word_count} sõna) – konteksti ei piisa")
-            return f"⚠️ LLM vastus on liiga lühike ({word_count} sõna):\n\n{answer}"
+            logger.warning(f" ⚠️ Liiga lühike ({word_count} sõna)")
+            return f"⚠️ LLM vastus on liiga lühike:\n\n{answer}"
         
         return answer
-        
+    
+    except requests.exceptions.ConnectionError:
+        logger.error(f"❌ Cannot connect to llama.cpp ({LLAMA_CPP_URL})")
+        return f"❌ Error: cannot connect to llama.cpp at {LLAMA_CPP_URL}!"
+    
     except Exception as e:
+        logger.error(f"❌ Viga: {e}")
         return f"❌ Viga: {e}"
 
-def rag_query(query: str, articles: List[Dict], chunks: List[Dict], 
+def rag_query(query: str, articles: List[Dict], chunks: List[Dict],
               temperature: float = 0.6, max_tokens: int = 1500) -> Dict:
     """RAG tsükkel."""
-    print(f"\n{'='*70}")
-    print(f"🔍 RAG PÄRING")
-    print(f"{'='*70}")
-    print(f"Küsimus: {query}")
-    print(f"Artikleid: {len(articles)}, Chunk'e: {len(chunks)}")
     
-    context = build_context(articles, chunks, max_chars=8000)  # 5000 → 8000
-    answer = generate_answer(query, context, temperature=temperature, max_tokens=max_tokens)
+    logger.info(f"\n{'='*70}")
+    logger.info(f"🔍 RAG PÄRING")
+    logger.info(f"{'='*70}")
+    logger.info(f"Küsimus: {query}")
+    logger.info(f"Artikleid: {len(articles)}, Chunk'e: {len(chunks)}")
     
-    # DEBUG: save the context
-    with open("/tmp/rag_context_debug.txt", "w") as f:
-        f.write(context)
+    context = build_context(articles, chunks, max_chars=8000)
+    answer = generate_answer(query, context, temperature=temperature, max_tokens=max_tokens)
     
     return {
         "query": query,
         "answer": answer,
         "sources": {
-            "articles": [
-                {
-                    "title": a["title"], 
-                    "authors": a.get("authors", [])[:3],
-                    "score": a.get("score", 0)
-                } 
-                for a in articles[:8]  # 5 → 8
-            ],
+            "articles": [{
+                "title": a["title"],
+                "authors": a.get("authors", [])[:3],
+                "score": a.get("score", 0)
+            } for a in articles[:8]],
             "chunks_used": len(set((c['filename'], c['page']) for c in chunks[:15]))
         },
         "parameters": {
@@ -179,34 +181,3 @@ def rag_query(query: str, articles: List[Dict], chunks: List[Dict],
             "context_weaviate_chars": len("\n".join([a['summary_et'][:900] for a in articles[:8]])),
         }
     }
-
-if __name__ == "__main__":
-    from src.query_hybrid import hybrid_search
-    
-    query = "young driver accident risk"
-    query = "traffic flow"
-    query = "road safety"
-    print(f"\n🚀 Käivitan hübriidotsingu...")
-    
-    results = hybrid_search(query, top_articles=10, top_chunks=20)
-    
-    print(f"\n🤖 Genereerin RAG vastust...")
-    rag_result = rag_query(
-        query=results["query"],
-        articles=results["articles"],
-        chunks=results["chunks"],
-        temperature=0.6,
-        max_tokens=1500
-    )
-    
-    print("\n" + "=" * 70)
-    print("🎯 RAG VASTUS:")
-    print("=" * 70)
-    print(rag_result["answer"])
-    
-    print("\n" + "=" * 70)
-    print("📊 STATISTIKA:")
-    print("=" * 70)
-    print(f"Weaviate konteksti: {rag_result['parameters']['context_weaviate_chars']} märki")
-    print(f"Kokku konteksti: {rag_result['parameters']['context_chars']} märki")
-    print(f"Kokku tokenit: ~{rag_result['parameters']['context_tokens_approx']}")

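With the __main__ demo removed from this module, a minimal caller sketch (mirroring the deleted block; hybrid_search is used with the same signature as there):

# sketch: driving rag_query from query_hybrid, as the removed __main__ block did
from src.query_hybrid import hybrid_search
from src.generate_answer import rag_query

results = hybrid_search("road safety", top_articles=10, top_chunks=20)
rag_result = rag_query(
    query=results["query"],
    articles=results["articles"],
    chunks=results["chunks"],
    temperature=0.6,
    max_tokens=1500,
)
print(rag_result["answer"])
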
+ 290 - 68
src/sync_weaviate.py

@@ -1,10 +1,17 @@
 # file: src/sync_weaviate.py
-# file: src/sync_weaviate.py
+
 import os
-import uuid
+import json
 from pathlib import Path
+from typing import Any, Dict, List, Optional
+from datetime import datetime
 from dotenv import load_dotenv
+
 import psycopg2
+from psycopg2.extras import execute_values
+from psycopg2 import sql
+from uuid import UUID
+
 from weaviate import WeaviateClient
 from weaviate.connect import ConnectionParams
 
@@ -16,7 +23,6 @@ DB_NAME = os.getenv("DB_NAME", "pdf_research")
 DB_USER = os.getenv("DB_USER", "pdf_user")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 
-# Weaviate - use the actual address
 WEAVIATE_HOST = os.getenv("WEAVIATE_HOST", "localhost")
 WEAVIATE_HTTP_PORT = int(os.getenv("WEAVIATE_HTTP_PORT", "8080"))
 WEAVIATE_CLASS = os.getenv("WEAVIATE_CLASS", "ScientificArticle")
@@ -32,104 +38,320 @@ def get_db_conn():
 
 def get_weaviate_client() -> WeaviateClient:
     """Weaviate v4 klient - HTTP ainult"""
-    client = WeaviateClient(connection_params=ConnectionParams.from_params(
-        http_host=WEAVIATE_HOST,
-        http_port=WEAVIATE_HTTP_PORT,
-        http_secure=False,
-        grpc_host=WEAVIATE_HOST,
-        grpc_port=50051,  # default gRPC port; does not have to be running
-        grpc_secure=False,
-    ))
+    client = WeaviateClient(
+        connection_params=ConnectionParams.from_params(
+            http_host=WEAVIATE_HOST,
+            http_port=WEAVIATE_HTTP_PORT,
+            http_secure=False,
+            grpc_host=WEAVIATE_HOST,
+            grpc_port=50051,
+            grpc_secure=False,
+        )
+    )
     client.connect()
     print(f"✓ Ühendatud Weaviate'ga: {WEAVIATE_HOST}:{WEAVIATE_HTTP_PORT}")
     return client
 
-def sync_weaviate_to_postgres():
-    """
-    Reads all ScientificArticle objects from Weaviate,
-    looks up the matching raw_documents rows (by source_file),
-    and updates the weaviate_article_id column.
-    """
-    print("Connecting to Weaviate...")
-    client = get_weaviate_client()
-    
-    try:
-        is_ready = client.is_ready()
-        print(f"✓ Weaviate on valmis: {is_ready}")
-    except Exception as e:
-        print(f"❌ Weaviate ühenduse viga: {e}")
-        client.close()
-        return
+def clean_nul_bytes(value):
+    """Eemalda NULL-baidid kõikidest väärtustest"""
+    if isinstance(value, str):
+        return value.replace("\x00", "").replace("\x01", "")
+    elif isinstance(value, dict):
+        return {k: clean_nul_bytes(v) for k, v in value.items()}
+    elif isinstance(value, list):
+        return [clean_nul_bytes(item) for item in value]
+    return value
 
+def fetch_weaviate_articles() -> List[Dict[str, Any]]:
+    """Loeb kõik ScientificArticle objektid Weaviate'st koos metaandmetega."""
     print(f"Fetching articles from Weaviate (klass: {WEAVIATE_CLASS})...")
     
+    client = get_weaviate_client()
     try:
-        # Weaviate v4: fetch_objects without a filter
         collection = client.collections.get(WEAVIATE_CLASS)
         
-        # Fetch all objects - use a large limit
         results = collection.query.fetch_objects(
             limit=10000,
-            return_properties=["source_file", "title"]
+            return_properties=[
+                "source_file",
+                "title",
+                "doi",
+                "journal",
+                "year",
+                "authors",
+                "key_concepts",
+                "methods_used",
+                "summary_et",
+                "abstract_en",
+                "transport_context",
+                "relevance_score",
+                "processing_date",
+                "file_hash",
+            ],
         )
         
         print(f"✓ Leitud {len(results.objects)} artiklit Weaviate'st")
-    except Exception as e:
-        print(f"❌ Weaviate päringu viga: {e}")
-        import traceback
-        traceback.print_exc()
+        
+        articles: List[Dict[str, Any]] = []
+        for obj in results.objects:
+            props = obj.properties or {}
+            
+            source_file = (props.get("source_file") or "").strip()
+            if not source_file:
+                continue
+            
+            # Normalize the path: strip the ./data/pdfs/ or data/pdfs/ prefix
+            if source_file.startswith("./data/pdfs/"):
+                filename = source_file.replace("./data/pdfs/", "")
+            elif source_file.startswith("data/pdfs/"):
+                filename = source_file.replace("data/pdfs/", "")
+            else:
+                filename = Path(source_file).name
+            
+            article = {
+                "weaviate_id": str(obj.uuid),
+                "source_file": source_file,
+                "filename": filename,
+                "title": props.get("title"),
+                "doi": props.get("doi"),
+                "journal": props.get("journal"),
+                "year": props.get("year"),
+                "authors": props.get("authors") or [],
+                "key_concepts": props.get("key_concepts") or [],
+                "methods_used": props.get("methods_used") or [],
+                "summary_et": props.get("summary_et"),
+                "abstract_en": props.get("abstract_en"),
+                "transport_context": props.get("transport_context"),
+                "relevance_score": props.get("relevance_score"),
+                "processing_date": props.get("processing_date"),
+                "file_hash": clean_nul_bytes(str(props.get("file_hash") if props.get("file_hash") else "")),
+            }
+            
+            articles.append(article)
+        
+        return articles
+    
+    finally:
         client.close()
-        return
 
+def sync_weaviate_to_postgres():
+    """
+    Syncs Weaviate ScientificArticle metadata into the raw_documents table.
+    MATCHING STRATEGY (in priority order):
+    1. File-hash match (most reliable)
+    2. Filename match (fallback; may produce false positives)
+    - match: raw_documents.filename == Path(source_file).name
+    - updates: weaviate_article_id, title, doi, journal, year, authors, key_concepts,
+               methods_used, summary_et, abstract_en, transport_context, relevance_score, processing_date
+    """
+    print("Connecting to PostgreSQL...")
     conn = get_db_conn()
     cur = conn.cursor()
-
+    
+    # Build a file_hash → (id, filename) lookup from PostgreSQL
+    print("Loading PostgreSQL hash index...")
+    cur.execute("SELECT id, filename, file_hash FROM raw_documents WHERE file_hash IS NOT NULL")
+    postgres_by_hash = {}
+    for row in cur.fetchall():
+        doc_id, filename, file_hash = row
+        if file_hash:
+            postgres_by_hash[file_hash] = (doc_id, filename)
+            #print(f"file_hash:  {file_hash}")
+    print(f"   Loaded {len(postgres_by_hash)} documents with hashes\n")
+        
+    synced = 0
+    not_found_hash = 0
+    not_found_filename = 0
+    errors = 0
+    
+    stats = {
+        "hash_matches": 0,
+        "filename_matches": 0,
+        "filename_fallback": 0,
+    }
+  
+    articles = fetch_weaviate_articles()
+    #print(f"Starting synchronization of {len(articles)} articles...")
+    
-
-    for obj in results.objects:
+    
+    for art in articles:
         try:
-            props = obj.properties
-            source_file = props.get("source_file")
-            title = props.get("title", "N/A")
-            weaviate_id = obj.uuid
-
-            if not source_file or not weaviate_id:
-                print(f"⚠️ Puudub source_file või id: {title}")
-                not_found += 1
-                continue
+            filename = art["filename"]
+            weaviate_id = art["weaviate_id"]
+            source_file = art["source_file"]
+            #print(f"Testimine:  {filename}  -  {weaviate_uuid} -  {source_file}")
+            weaviate_hash = art.get("file_hash", "")
+            #print(f"Testimine weaviate_hash:  {weaviate_hash},  weaviate_id:  {weaviate_id} ")
 
-            weaviate_uuid = str(weaviate_id)
-            filename = Path(source_file).name
-
-            cur.execute(
-                "SELECT id FROM raw_documents WHERE filename = %s",
-                (filename,)
-            )
-            row = cur.fetchone()
-
-            if row:
-                raw_doc_id = row[0]
+            raw_doc_id = None
+            match_strategy = None
+            
+            # STRATEGY 1: file-hash match (primary)
+            if weaviate_hash:
+                result = postgres_by_hash.get(weaviate_hash)
+                if result:
+                    raw_doc_id, pg_filename = result
+                    match_strategy = "HASH"
+                    stats["hash_matches"] += 1
+                    print(f"✅ [HASH] {filename}")
+                else:
+                    not_found_hash += 1
+            
+            # STRATEGY 2: filename match (fallback)
+            if not raw_doc_id:
                 cur.execute(
-                    "UPDATE raw_documents SET weaviate_article_id = %s WHERE id = %s",
-                    (weaviate_uuid, raw_doc_id)
+                    "SELECT id FROM raw_documents WHERE filename = %s",
+                    (filename,)
                 )
-                synced += 1
-                print(f"✓ {title[:50]}... (pg_id={raw_doc_id})")
+                row = cur.fetchone()
+                if row:
+                    raw_doc_id = row[0]
+                    match_strategy = "FILENAME"
+                    stats["filename_matches"] += 1
+                    
+                    if not weaviate_hash:
+                        stats["filename_fallback"] += 1
+                        print(f"⚠️  [FALLBACK] {filename} (filehash missing)")
+                    else:
+                        print(f"⚠️  [FALLBACK] {filename} (filehash: {weaviate_hash[:16]}...)")
+                else:
+                    not_found_filename += 1
+                    print(f"❌ [NOT FOUND] {filename}")
+                    continue
+                        
+            # UPDATE raw_documents
+            title = clean_nul_bytes(art.get("title"))
+            doi = clean_nul_bytes(art.get("doi"))
+            journal = clean_nul_bytes(art.get("journal"))
+            year = art.get("year")
+            relevance_score = art.get("relevance_score")
+            
+            summary_et = clean_nul_bytes(art.get("summary_et", ""))
+            abstract_en = clean_nul_bytes(art.get("abstract_en", ""))
+            authors = clean_nul_bytes(art.get("authors", ""))
+            key_concepts = clean_nul_bytes(art.get("key_concepts", ""))
+            methods_used = clean_nul_bytes(art.get("methods_used", ""))
+            
+            # Transport context JSON
+            transport_context_json = None
+            if art.get("transport_context"):
+                try:
+                    if isinstance(art["transport_context"], str):
+                        transport_context_json = json.loads(art["transport_context"])
+                    else:
+                        transport_context_json = art["transport_context"]
+                except (json.JSONDecodeError, TypeError):
+                    transport_context_json = None
+                    
+            if transport_context_json:
+                transport_context = json.dumps(transport_context_json) 
             else:
-                not_found += 1
-                print(f"⚠️ Ei leitud pgvector'ist: {filename}")
+                transport_context = None
+            
+            # Processing date
+            processing_date = None
+            if art.get("processing_date"):
+                try:
+                    if isinstance(art["processing_date"], str):
+                        processing_date = datetime.fromisoformat(art["processing_date"])
+                    else:
+                        processing_date = art["processing_date"]
+                except (ValueError, TypeError):
+                    processing_date = None          
+            
+            #print(f"1.weaviate_id {weaviate_id}")
+            #print(f"2.title {title}")
+            #print(f"3.doi {doi}")
+            #print(f"4.journal {journal}")
+            #print(f"5.year {year}")
+            #print(f"6.authors {authors}")
+            #print(f"7.key_concepts {key_concepts}")
+            #print(f"8.methods_used {methods_used}")
+            #print(f"9.summary_et {summary_et}")
+            #print(f"10.transport_context {transport_context_json}")
+            #print(f"11.relevance_score {relevance_score}")
+            #print(f"12.processing_date {processing_date}")
+            
+            # Update the fields; COALESCE keeps the existing value when the new one is NULL
+            cur.execute(
+                """
+                UPDATE raw_documents
+                SET
+                    weaviate_article_id = %s,
+                    title = COALESCE(%s, title),
+                    doi = COALESCE(%s, doi),
+                    journal = COALESCE(%s, journal),
+                    year = COALESCE(%s, year),
+                    authors = COALESCE(%s, authors),
+                    key_concepts = COALESCE(%s, key_concepts),
+                    methods_used = COALESCE(%s, methods_used),
+                    summary_et = COALESCE(%s, summary_et),
+                    abstract_en = COALESCE(%s, abstract_en),
+                    transport_context = COALESCE(%s::jsonb, transport_context),
+                    relevance_score = COALESCE(%s, relevance_score),
+                    processing_date = COALESCE(%s, processing_date)
+                WHERE id = %s
+                """,
+                (
+                    weaviate_id,
+                    title,
+                    doi,
+                    journal,
+                    year,
+                    authors,
+                    key_concepts,
+                    methods_used,
+                    summary_et,
+                    abstract_en,
+                    transport_context,
+                    relevance_score,
+                    processing_date,
+                    raw_doc_id,
+                ),
+            )
+            
+            conn.commit()  # commit per article so a later failure cannot roll earlier updates back
+            synced += 1
+            print(f"✓ {(art.get('title') or '')[:60]}... (filename={filename}, pg_id={raw_doc_id})")
+        
         except Exception as e:
             errors += 1
-            print(f"❌ Viga objektiga: {e}")
-
+            print(f"❌ Viga {art.get('title', 'N/A')} (file={art.get('filename')}): {e}")
+            # ✅ OLULINE: Rollback selle vea jaoks!
+            conn.rollback()
+            # Ühenda uuesti
+            cur.close()
+            conn.close()
+            conn = get_db_conn()
+            cur = conn.cursor()
+    
     conn.commit()
     cur.close()
     conn.close()
-    client.close()
-
+    
     print(f"\n✓ Kokkuvõte: {synced} sünkroniseeritud, {not_found} ei leitud, {errors} viga")
 
 if __name__ == "__main__":
     sync_weaviate_to_postgres()
+    # Simple debug: print the articles currently in Weaviate
+    articles = fetch_weaviate_articles()
+    print(f"Found {len(articles)} articles in Weaviate")
+    #for art in articles:
+    #    print(
+    #        f"WeaviateID={art['weaviate_id']} "
+    #        f"filename={art['filename']} "
+    #        f"doi={art.get('doi')} "
+    #        f"title={art.get('title')[:80] if art.get('title') else ''}"
+    #    )
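
A sync run can be verified with a small check counting the linked rows (column names from the ALTER TABLE in scripts/db_vector_välja_muutmine.sh):

# sketch: how many raw_documents rows got a Weaviate link
import psycopg2

conn = psycopg2.connect("postgresql://osm:osm@100.87.1.24:5432/pdf_research")
with conn.cursor() as cur:
    cur.execute("""
        SELECT COUNT(*) FILTER (WHERE weaviate_article_id IS NOT NULL), COUNT(*)
        FROM raw_documents
    """)
    linked, total = cur.fetchone()
print(f"{linked}/{total} documents linked to Weaviate")
conn.close()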