Browse source

Working code with weaviate export

ardo 3 months ago
parent
commit
31e2b690f3

+ 41 - 0
.env

@@ -0,0 +1,41 @@
+# PostgreSQL
+DB_HOST=100.87.1.24
+DB_PORT=5432
+DB_NAME=pdf_research
+DB_USER=osm
+DB_PASSWORD=osm
+DATABASE_URL='postgresql://osm:osm@100.87.1.24:5432/pdf_research'
+
+
+# DeepSeek (later)
+DEEPSEEK_API_KEY=sk-c6766d328c2446f78bfe509d3c7ad4b3
+EMBED_MODEL=deepseek-embedding-v2
+EMBED_MODEL_NAME=BAAI/bge-small-en-v1.5
+
+LLAMA_CPP_URL=http://100.87.1.24:8070/completion
+LLAMA_EMBED_BIN=/home/ardo/llama.cpp/build/bin/llama-embedding
+LLAMA_EMBED_MODEL=/models/mistral-7b-v0.1.Q4_K_M.gguf
+EMBED_DIM=384
+
+# Open AI
+OPENAI_API_KEY=sk-None-opa0spEeiLKr0wsQCDa9T3BlbkFJbkv6l9uZUA6xNTEiBK5R
+
+# Project settings
+PDF_INPUT_DIR=data/raw_pdfs
+EXTRACTED_OUTPUT_DIR=data/extracted
+PROCESSED_OUTPUT_DIR=data/processed
+CHUNKS_OUTPUT_DIR=data/chunks
+
+CHUNK_SIZE=1024
+CHUNK_OVERLAP=200
+
+# Weaviate
+#WEAVIATE_URL=http://localhost:8080
+WEAVIATE_HOST=100.87.1.24
+WEAVIATE_HTTP_PORT=8080
+WEAVIATE_URL=http://100.87.1.24:8080
+WEAVIATE_API_KEY=
+WEAVIATE_CLASS=ScientificArticle
+
+# Ollama
+OLLAMA_URL=http://100.87.1.24:11434
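
For reference, a minimal sketch of how these settings are consumed at runtime (python-dotenv, which the src/ modules below already use; names match the .env keys above):

# sketch: reading the .env settings the same way the src/ modules do
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory

DATABASE_URL = os.getenv("DATABASE_URL")
WEAVIATE_URL = os.getenv("WEAVIATE_URL", "http://localhost:8080")
CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1024"))
CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "200"))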

+ 2 - 1
.gitignore

@@ -58,7 +58,7 @@ docs/_build/
 # PyBuilder
 target/
 
-# Ardo
+# ArdoS
 testimiseks/
 data/chunks
 data/extracted
@@ -66,3 +66,4 @@ data/processed
 data/raw_pdfs
 archive/
 output/
+help/tmp

+ 6 - 2
LOEMIND.md

@@ -14,11 +14,17 @@ python -m src.extract_pdf  # with fixed extraction
 python -m src.check_status  # with a better overview
 
 python -m src.find_duplicates  # duplicate check
+# document processing
 python -m src.clean_and_normalize
 
 python -m src.create_chunks
+# The script below has a BATCH_LIMIT parameter
+# Re-run it until all embeddings have been added (see the loop sketch after this section)
 python -m src.embed_chunks
 
+# Synchronization with the Weaviate database
+python -m src.sync_weaviate
+
 # Testing
 docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "SELECT COUNT(*) FROM processed_documents;"
 docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "
@@ -28,8 +34,6 @@ docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "
   LIMIT 20;
 "
 
-# Synchronization with the Weaviate database
-python -m src.sync_weaviate
 
 python -m src.query_hybrid
 python -m src.generate_answer
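
The BATCH_LIMIT re-run noted above can be looped; a hedged sketch (the chunks table and its embedding column are assumptions, since src.embed_chunks itself is not shown in this commit):

# sketch: re-run src.embed_chunks until no chunks are left without an embedding
# ASSUMPTION: a chunks table with an embedding column; adjust to the real schema
import subprocess
import psycopg2

conn = psycopg2.connect("postgresql://osm:osm@100.87.1.24:5432/pdf_research")
while True:
    subprocess.run(["python", "-m", "src.embed_chunks"], check=True)
    with conn.cursor() as cur:
        cur.execute("SELECT COUNT(*) FROM chunks WHERE embedding IS NULL")
        remaining = cur.fetchone()[0]
    if remaining == 0:
        break
conn.close()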

+ 3 - 0
help/huvitavate_asjade_nimekiri.md

@@ -0,0 +1,3 @@
+# Interesting things
+
+1. Gravitation article - 1009.6090v1.pdf

+ 208 - 0
jupyter/weaviate_testimine.ipynb

@@ -0,0 +1,208 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "4eda7add",
+   "metadata": {},
+   "source": [
+    "# Weaviate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4956326d",
+   "metadata": {},
+   "source": [
+    "## Ühenduse loomine Weaviate serveriga"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "6f1ebd05",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from weaviate import WeaviateClient\n",
+    "from weaviate.connect import ConnectionParams\n",
+    "from weaviate.classes.query import Filter\n",
+    "\n",
+    "from pathlib import Path"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac2f77bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Ühenda HTTP abil\n",
+    "client = WeaviateClient(\n",
+    "    connection_params=ConnectionParams.from_params(\n",
+    "        http_host=\"100.87.1.24\",\n",
+    "        http_port=8080,\n",
+    "        http_secure=False,\n",
+    "        grpc_host=\"100.87.1.24\",\n",
+    "        grpc_port=50051,\n",
+    "        grpc_secure=False,\n",
+    "    )\n",
+    ")\n",
+    "client.connect()\n",
+    "\n",
+    "print(f\"✓ Ühendatud Weaviate'ga: {client.is_ready()}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "49994031",
+   "metadata": {},
+   "source": [
+    "## Muud käsud"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "9201cb41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection = client.collections.get(\"ScientificArticle\")\n",
+    "results = collection.query.fetch_objects(limit=10)\n",
+    "print(f\"Leitud {len(results.objects)} objekti\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "93b0b0e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(results)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "c3a7bbbb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection = client.collections.get('ScientificArticle')\n",
+    "results = collection.query.fetch_objects(limit=2)\n",
+    "\n",
+    "for obj in results.objects:\n",
+    "    props = obj.properties\n",
+    "    print(f\"Title: {props.get('title')}, DOI: {props.get('doi')}, source_file: {props.get('source_file')}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "07970b96",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "collection = client.collections.get('ScientificArticle')\n",
+    "results = collection.query.fetch_objects(limit=2)\n",
+    "\n",
+    "for obj in results.objects:\n",
+    "    props = obj.properties\n",
+    "    filename = props.get('source_file')\n",
+    "    print(f\"filename: {filename}\")\n",
+    "    source_file = filename.strip()\n",
+    "    if source_file.startswith(\"./data/pdfs/\"):\n",
+    "        filename = source_file.replace(\"./data/pdfs/\", \"\")\n",
+    "        if source_file.startswith(\"data/pdfs/\"):\n",
+    "            filename = source_file.replace(\"data/pdfs/\", \"\")\n",
+    "        else:\n",
+    "            filename = Path(source_file).name\n",
+    "    print(f\"filename: {filename}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "6ccfa014",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Leia artiklid, mis sisaldavad NUL-baite\n",
+    "problem_files = [\n",
+    "    'Shaver-StatisticalSignificanceTesting-1993.pdf',\n",
+    "    'Safety-effectiveness-of-forward-collision-warning-syste2025Accident-Analys.pdf',\n",
+    "    '1910.13885v1.pdf',\n",
+    "    '2503.14666v1.pdf',\n",
+    "    '2503.14914v1.pdf'\n",
+    "]\n",
+    "\n",
+    "collection = client.collections.get('ScientificArticle')\n",
+    "results = collection.query.fetch_objects(limit=900)\n",
+    "\n",
+    "for obj in results.objects:\n",
+    "    props = obj.properties\n",
+    "    filename = props.get('source_file')\n",
+    "    source_file = filename.strip()\n",
+    "    if source_file.startswith(\"./data/pdfs/\"):\n",
+    "        filename = source_file.replace(\"./data/pdfs/\", \"\")\n",
+    "        if source_file.startswith(\"data/pdfs/\"):\n",
+    "            filename = source_file.replace(\"data/pdfs/\", \"\")\n",
+    "        else:\n",
+    "            filename = Path(source_file).name\n",
+    "\n",
+    "    for problem_filename in problem_files:\n",
+    "        if problem_filename == filename:\n",
+    "            print(f\"clean filename: {filename}\")\n",
+    "            article = props.get('title')\n",
+    "            article_id = props.get('article_id')\n",
+    "\n",
+    "            print(f\"Artikkel: {article} - {article_id}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "86adec54",
+   "metadata": {},
+   "source": [
+    "## Sulge ühendused"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e654ae89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "client.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a93bd44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Kasuta\n",
+    "try:\n",
+    "    collection = client.collections.get(\"ScientificArticle\")\n",
+    "    results = collection.query.fetch_objects(limit=10)\n",
+    "    print(f\"Leitud {len(results.objects)} objekti\")\n",
+    "finally:\n",
+    "    client.close()\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "PDF Pipeline (venv)",
+   "language": "python",
+   "name": "pdf-env"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
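
The notebook imports weaviate.classes.query.Filter but never exercises it; a minimal sketch of a filtered fetch with the v4 client (property names as in the cells above):

# sketch: fetch only the objects whose source_file matches one problem file
from weaviate.classes.query import Filter

collection = client.collections.get("ScientificArticle")
results = collection.query.fetch_objects(
    filters=Filter.by_property("source_file").like("*1910.13885v1.pdf"),
    limit=10,
)
for obj in results.objects:
    print(obj.uuid, obj.properties.get("title"))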

+ 16 - 2
scripts/db_vector_välja_muutmine.sh

@@ -16,11 +16,24 @@ docker exec -it postgres_postgis bash -c "
   \"
 "
 
-# Add a weaviate_article_id column to raw_documents
+# Add extra columns to raw_documents for carrying data over from the Weaviate database
 docker exec -it postgres_postgis bash -c "
   export PGPASSWORD='osm' &&
   psql -U osm -h localhost -d pdf_research -c \"
-ALTER TABLE raw_documents ADD COLUMN weaviate_article_id UUID;
+ALTER TABLE raw_documents
+  ADD COLUMN IF NOT EXISTS weaviate_article_id UUID,
+  ADD COLUMN IF NOT EXISTS title TEXT,
+  ADD COLUMN IF NOT EXISTS doi TEXT,
+  ADD COLUMN IF NOT EXISTS journal TEXT,
+  ADD COLUMN IF NOT EXISTS year REAL,
+  ADD COLUMN IF NOT EXISTS authors TEXT[],
+  ADD COLUMN IF NOT EXISTS key_concepts TEXT[],
+  ADD COLUMN IF NOT EXISTS methods_used TEXT[],
+  ADD COLUMN IF NOT EXISTS summary_et TEXT,
+  ADD COLUMN IF NOT EXISTS abstract_en TEXT,
+  ADD COLUMN IF NOT EXISTS transport_context JSONB,
+  ADD COLUMN IF NOT EXISTS relevance_score REAL,
+  ADD COLUMN IF NOT EXISTS processing_date TIMESTAMPTZ;
 CREATE INDEX idx_raw_weaviate_id ON raw_documents(weaviate_article_id);
   \"
 "
@@ -31,3 +44,4 @@ docker exec -it postgres_postgis bash -c "
   SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'raw_documents' ORDER BY ordinal_position;
   \"
 "
+
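
The same information_schema check the script runs through docker exec can also be done from Python; a minimal sketch using psycopg2 with the credentials from .env:

# sketch: list the raw_documents columns to confirm the ALTER TABLE took effect
import psycopg2

conn = psycopg2.connect("postgresql://osm:osm@100.87.1.24:5432/pdf_research")
with conn.cursor() as cur:
    cur.execute("""
        SELECT column_name, data_type
        FROM information_schema.columns
        WHERE table_name = 'raw_documents'
        ORDER BY ordinal_position
    """)
    for name, dtype in cur.fetchall():
        print(f"{name}: {dtype}")
conn.close()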

+ 130 - 53
src/clean_and_normalize.py

@@ -1,6 +1,8 @@
 # file: src/clean_and_normalize.py
+
 import os
 import json
+import re
 from pathlib import Path
 from dotenv import load_dotenv
 import psycopg2
@@ -25,81 +27,156 @@ def get_db_conn():
     )
 
 def clean_text_basic(text: str) -> str:
-    # Very simple cleanup: trim, collapse repeated whitespace into one space
-    import re
+    """
+    Puhastab tekstist ohtlikud karakterid:
+    - NUL baited (0x00)
+    - Juhtimiskarakateid
+    - Mitmekordseid tühikuid
+    - Mitmekordseid reavahetus
+    """
     if not text:
         return ""
+    
+    # ⚠️ IMPORTANT: remove NUL bytes
+    text = text.replace("\x00", "")
+    
+    # Remove the other control characters (keep \n, \t, \r),
+    # i.e. keep only printable characters
+    text = "".join(
+        ch for ch in text 
+        if ord(ch) >= 32 or ch in "\n\t\r"
+    )
+    
+    # Regular cleanup
     text = text.replace("\r", "\n")
-    text = text.replace("\u00a0", " ")  # non‑breaking space
-    text = re.sub(r"[ \t]+", " ", text)
-    text = re.sub(r"\n{3,}", "\n\n", text)
+    text = text.replace("\u00a0", " ")  # non-breaking space
+    text = re.sub(r"[ \t]+", " ", text)  # mitmekordsed tühikud
+    text = re.sub(r"\n{3,}", "\n\n", text)  # mitmekordsed reavahetus
+    
     return text.strip()
 
 def process_single_doc(raw_doc_id: int, filename: str):
+    """
+    Töötleb ühe dokumendi pages.json failist ja sisestab 
+    cleaned teksti processed_documents tabelisse.
+    """
     pdf_stem = Path(filename).stem
     json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"
-
+    
     if not json_path.exists():
         print(f"❌ pages.json puudub: {json_path}")
         return
-
-    pages_data = json.loads(json_path.read_text())
-
+    
+    # Load the JSON with error handling
+    try:
+        # Use errors='replace' to cope with invalid bytes
+        raw_json_content = json_path.read_text(encoding='utf-8', errors='replace')
+        pages_data = json.loads(raw_json_content)
+    except json.JSONDecodeError as e:
+        print(f"❌ JSON parse error {filename}: {e}")
+        return
+    except Exception as e:
+        print(f"❌ Viga JSON laadimisel {filename}: {e}")
+        return
+    
     conn = get_db_conn()
     cur = conn.cursor()
-
     inserted = 0
-
-    for page in pages_data:
-        page_num = page["page"]
-        raw_text = page.get("text") or ""
-        text = clean_text_basic(raw_text)
-
-        if not text:
-            print(f"  ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
-            continue
-
-        cur.execute("""
-            INSERT INTO processed_documents
-                (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
-            VALUES
-                (%s, %s, %s, %s, %s, %s, %s)
-        """, (
-            raw_doc_id,
-            page_num,
-            f"page_{page_num}",
-            text,
-            False,
-            None,
-            None,
-        ))
-        inserted += 1
-
-    conn.commit()
-    cur.close()
-    conn.close()
-
-    print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")
+    failed = 0
+    
+    try:
+        for page_data in pages_data:
+            try:
+                page_num = page_data.get("page", 0)
+                raw_text = page_data.get("text") or ""
+                
+                # Clean the text
+                text = clean_text_basic(raw_text)
+                
+                if not text:
+                    print(f"  ⚠️ Leht {page_num}: tühi tekst pärast puhastamist")
+                    failed += 1
+                    continue
+                
+                # Extra guard: check whether NUL bytes survived cleaning
+                if "\x00" in text:
+                    print(f"  ⚠️ Page {page_num}: NUL bytes after cleaning?")
+                    text = text.replace("\x00", "")
+                
+                # INSERT into processed_documents
+                cur.execute("""
+                    INSERT INTO processed_documents
+                    (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
+                    VALUES
+                    (%s, %s, %s, %s, %s, %s, %s)
+                """, (
+                    raw_doc_id,
+                    page_num,
+                    f"page_{page_num}",
+                    text,
+                    False,
+                    None,
+                    None,
+                ))
+                
+                inserted += 1
+                
+            except ValueError as e:
+                # NUL byte error
+                if "NUL" in str(e):
+                    print(f"  ⚠️ Page {page_num}: NUL byte error - skipping")
+                    failed += 1
+                else:
+                    raise
+            
+            except Exception as e:
+                print(f"  ⚠️ Leht {page_num} error: {type(e).__name__}: {e}")
+                failed += 1
+                continue
+        
+        # All inserts done; commit once
+        if inserted > 0:
+            conn.commit()
+        
+    except Exception as e:
+        conn.rollback()
+        print(f"❌ Viga {filename} töötlemises: {e}")
+        return
+    
+    finally:
+        cur.close()
+        conn.close()
+    
+    print(f"✓ {filename}: lisatud {inserted} rida, {failed} vahele jäänud")
 
 def process_all_docs():
+    """Töötleb kõik raw_documents failid."""
     conn = get_db_conn()
     cur = conn.cursor()
-
-    # Fetch all raw documents
-    cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
-    docs = cur.fetchall()
-    cur.close()
-    conn.close()
-
+    
+    try:
+        # Fetch all raw documents
+        cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
+        docs = cur.fetchall()
+    except Exception as e:
+        print(f"❌ Viga dokumentide loetlemises: {e}")
+        return
+    finally:
+        cur.close()
+        conn.close()
+    
     if not docs:
         print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
         return
-
-    for raw_doc_id, filename in docs:
-        print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
+    
+    total = len(docs)
+    for idx, (raw_doc_id, filename) in enumerate(docs, 1):
+        print(f"\n🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id}) [{idx}/{total}]")
         process_single_doc(raw_doc_id, filename)
-
-    print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")
+    
+    print("\n" + "="*70)
+    print("✓ Kõik dokumendid töödeldud (lehe-tasemel tekstina).")
+    print("="*70)
 
 if __name__ == "__main__":
     process_all_docs()
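
A quick demonstration of what clean_text_basic removes (the input string is made up for illustration):

# sketch: clean_text_basic on a hostile string
from src.clean_and_normalize import clean_text_basic

dirty = "Intro\x00duction\x07  to   road\r\nsafety\n\n\n\nResults"
print(repr(clean_text_basic(dirty)))
# NUL (0x00) and BEL (0x07) are dropped, space runs collapse,
# and runs of 3+ newlines shrink to a single blank line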

+ 7 - 6
src/embed_utils.py

@@ -1,12 +1,14 @@
 # file: src/embed_utils.py
+
 import os
+from typing import List  # ← added
 from dotenv import load_dotenv
-from sentence_transformers import SentenceTransformer  # [web:133][web:134][web:135][web:141]
+from sentence_transformers import SentenceTransformer
 
 load_dotenv()
 
 EMBED_MODEL_NAME = os.getenv("EMBED_MODEL_NAME", "BAAI/bge-small-en-v1.5")
-EMBED_DIM = int(os.getenv("EMBED_DIM", "384"))  # bge-small-en-v1.5 [web:134][web:135]
+EMBED_DIM = int(os.getenv("EMBED_DIM", "384"))
 
 _model: SentenceTransformer | None = None
 
@@ -17,15 +19,14 @@ def get_model() -> SentenceTransformer:
         _model = SentenceTransformer(EMBED_MODEL_NAME)
     return _model
 
-def get_embedding(text: str) -> list[float]:
+def get_embedding(text: str) -> List[float]:  # ← fixed
     """
     Returns the embedding of a single chunk of text.
-    BGE v1.5 does not strictly require a special instruction; it can be used directly. [web:133][web:135][web:140][web:146]
+    BGE v1.5 does not need a special instruction prefix; the text can be embedded directly.
     """
     text = (text or "").strip()
     if not text:
         return []
-
     model = get_model()
-    emb = model.encode(text, normalize_embeddings=True)  # normalization suits cosine [web:133][web:135][web:141]
+    emb = model.encode(text, normalize_embeddings=True)
     return emb.tolist()
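
Because get_embedding returns L2-normalized vectors, cosine similarity reduces to a plain dot product; a small usage sketch (the two phrases are illustrative):

# sketch: cosine similarity between two normalized embeddings
from src.embed_utils import get_embedding

a = get_embedding("forward collision warning systems")
b = get_embedding("rear-end crash avoidance")
cosine = sum(x * y for x, y in zip(a, b))  # dot product of unit vectors
print(f"cosine similarity: {cosine:.3f}")  # vectors are EMBED_DIM=384 long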

+ 49 - 78
src/generate_answer.py

@@ -1,13 +1,20 @@
 # file: src/generate_answer.py
+
 import os
 import json
 import requests
+import logging
 from typing import List, Dict
 from dotenv import load_dotenv
 
+logger = logging.getLogger(__name__)
+
 load_dotenv()
 
-LLAMA_CPP_URL = os.getenv("LLAMA_CPP_URL", "http://localhost:8070/completion")
+# ===== IMPORTANT: environment variable =====
+LLAMA_CPP_URL = os.getenv("LLAMA_CPP_URL", "http://100.87.1.24:8070/completion")
+
+logger.info(f"🔧 llama.cpp URL: {LLAMA_CPP_URL}")
 
 def deduplicate_chunks(chunks: List[Dict]) -> List[Dict]:
     """Eemaldab duplikaadi chunk'id."""
@@ -23,28 +30,21 @@ def deduplicate_chunks(chunks: List[Dict]) -> List[Dict]:
     return unique_chunks
 
 def build_context(articles: List[Dict], chunks: List[Dict], max_chars: int = 8000) -> str:
-    """
-    Suurendatud kontekst. Weaviate andmete osa oluliselt suurem.
-    """
+    """Suurendatud kontekst. Weaviate andmete osa oluliselt suurem."""
     context_parts = []
     char_count = 0
-    
     unique_chunks = deduplicate_chunks(chunks)
     
-    # ENLARGED: Weaviate article metadata
     context_parts.append("=== ARTICLE SUMMARIES FROM WEAVIATE ===\n")
-    for i, art in enumerate(articles[:8], 1):  # 5 → 8 articles!
-        
-        # Assemble rich article info
+    
+    for i, art in enumerate(articles[:8], 1):
         authors_str = ', '.join(art['authors'][:3]) if art['authors'] else 'N/A'
-        keywords_str = ', '.join(art['key_concepts'][:6]) if art['key_concepts'] else 'N/A'
+        keywords_str = ', '.join(art['key_concepts'][:6]) if art.get('key_concepts') else 'N/A'
         methods_str = ', '.join(art['methods_used'][:4]) if art.get('methods_used') else 'N/A'
         
-        # Parse transport_context (a JSON string)
         transport_str = "N/A"
         try:
             if art.get('transport_context'):
-                import json
                 tc = json.loads(art['transport_context'])
                 theoretical = tc.get('theoretical_contribution', '')
                 practical = tc.get('practical_applicability', '')
@@ -53,24 +53,26 @@ def build_context(articles: List[Dict], chunks: List[Dict], max_chars: int = 800
             transport_str = art.get('transport_context', '')[:300]
         
         article_text = f"""[Article {i}] {art['title']}
+
 Authors: {authors_str}
 Year: {art.get('year', 'N/A')}
 Keywords: {keywords_str}
 Methods Used: {methods_str}
 Transport Context: {transport_str}
-Summary (Estonian): {art['summary_et'][:900]}  ← 350 → 900 chars!
+Summary (Estonian): {art['summary_et'][:900]}
 
 """
         context_parts.append(article_text)
         char_count += len(article_text)
         
-        if char_count > max_chars * 0.45:  # 45% for the articles
+        if char_count > max_chars * 0.45:
             break
     
-    # pgvector excerpts (same as before)
     context_parts.append("\n=== TEXT EXCERPTS FROM SOURCES ===\n")
+    
     for i, chunk in enumerate(unique_chunks[:15], 1):
         chunk_text = f"""[Excerpt {i}] ({chunk['filename']}, page {chunk['page']})
+
 {chunk['text'][:280]}
 
 """
@@ -83,24 +85,24 @@ Summary (Estonian): {art['summary_et'][:900]}  ← 350 → 900 chars!
     return "\n".join(context_parts)
 
 def generate_answer(query: str, context: str, temperature: float = 0.6, max_tokens: int = 1500) -> str:
-    """
-    Genereerib vastuse. Sama kui enne, aga kontekst on rikkam.
-    """
-    
+    """Genereerib vastuse. Kasutab llama.cpp REST API."""
     prompt = f"""You are an expert in scientific research on transportation safety and traffic engineering.
 
 CONTEXT (from scientific sources in Weaviate database):
+
 {context}
 
 USER QUESTION (in Estonian):
+
 {query}
 
 INSTRUCTIONS:
+
 - Answer in fluent Estonian language
 - Use data and findings from the sources provided above
 - Cite articles or excerpts by number in square brackets (e.g., "[Article 1]" or "[Excerpt 3]")
 - Be concrete, detailed, and factual
-- Write at least 250 words (important: use the context fully)
+- Write at least 250 words
 - Do NOT speculate or add information not present in the context
 - Reference the methods, findings, and practical applications mentioned in the articles
 - Structure your answer clearly with paragraphs
@@ -116,61 +118,61 @@ ANSWER (in Estonian):"""
         "presence_penalty": 0.1,
         "frequency_penalty": 0.1,
     }
-
-    print(f"📤 Saandan päring llama.cpp'le...")
-    print(f"   📊 Konteksti suurus: {len(context)} märki ≈ {len(context)//4} tokenit")
+    
+    logger.info(f"📤 Saandan päring llama.cpp'le...")
+    logger.info(f" 📊 Konteksti suurus: {len(context)} märki")
+    logger.info(f" 🔗 URL: {LLAMA_CPP_URL}")
     
     try:
         response = requests.post(LLAMA_CPP_URL, json=payload, timeout=240)
         response.raise_for_status()
-        result = response.json()
         
+        result = response.json()
         answer = result.get("content", "").strip()
         tokens_predicted = result.get("tokens_predicted", 0)
         
-        print(f"   ✅ Genereeriud: {tokens_predicted} tokenit, {len(answer.split())} sõna")
+        logger.info(f" ✅ Genereeriud: {tokens_predicted} tokenit")
         
         if not answer:
             return "⚠️ LLM tagastas tühja vastuse."
         
         word_count = len(answer.split())
         if word_count < 100:
-            print(f"   ⚠️  Liiga lühike ({word_count} sõna) – konteksti ei piisa")
-            return f"⚠️ LLM vastus on liiga lühike ({word_count} sõna):\n\n{answer}"
+            logger.warning(f" ⚠️ Liiga lühike ({word_count} sõna)")
+            return f"⚠️ LLM vastus on liiga lühike:\n\n{answer}"
         
         return answer
-        
+    
+    except requests.exceptions.ConnectionError:
+        logger.error(f"❌ Cannot connect to llama.cpp ({LLAMA_CPP_URL})")
+        return f"❌ Error: cannot connect to llama.cpp at {LLAMA_CPP_URL}!"
+    
     except Exception as e:
+        logger.error(f"❌ Viga: {e}")
         return f"❌ Viga: {e}"
 
-def rag_query(query: str, articles: List[Dict], chunks: List[Dict], 
+def rag_query(query: str, articles: List[Dict], chunks: List[Dict],
               temperature: float = 0.6, max_tokens: int = 1500) -> Dict:
     """RAG tsükkel."""
-    print(f"\n{'='*70}")
-    print(f"🔍 RAG PÄRING")
-    print(f"{'='*70}")
-    print(f"Küsimus: {query}")
-    print(f"Artikleid: {len(articles)}, Chunk'e: {len(chunks)}")
     
-    context = build_context(articles, chunks, max_chars=8000)  # 5000 → 8000
-    answer = generate_answer(query, context, temperature=temperature, max_tokens=max_tokens)
+    logger.info(f"\n{'='*70}")
+    logger.info(f"🔍 RAG PÄRING")
+    logger.info(f"{'='*70}")
+    logger.info(f"Küsimus: {query}")
+    logger.info(f"Artikleid: {len(articles)}, Chunk'e: {len(chunks)}")
     
-    # DEBUG: save the context
-    with open("/tmp/rag_context_debug.txt", "w") as f:
-        f.write(context)
+    context = build_context(articles, chunks, max_chars=8000)
+    answer = generate_answer(query, context, temperature=temperature, max_tokens=max_tokens)
     
     return {
         "query": query,
         "answer": answer,
         "sources": {
-            "articles": [
-                {
-                    "title": a["title"], 
-                    "authors": a.get("authors", [])[:3],
-                    "score": a.get("score", 0)
-                } 
-                for a in articles[:8]  # 5 → 8
-            ],
+            "articles": [{
+                "title": a["title"],
+                "authors": a.get("authors", [])[:3],
+                "score": a.get("score", 0)
+            } for a in articles[:8]],
             "chunks_used": len(set((c['filename'], c['page']) for c in chunks[:15]))
         },
         "parameters": {
@@ -179,34 +181,3 @@ def rag_query(query: str, articles: List[Dict], chunks: List[Dict],
             "context_weaviate_chars": len("\n".join([a['summary_et'][:900] for a in articles[:8]])),
         }
     }
-
-if __name__ == "__main__":
-    from src.query_hybrid import hybrid_search
-    
-    query = "young driver accident risk"
-    query = "traffic flow"
-    query = "road safety"
-    print(f"\n🚀 Käivitan hübriidotsingu...")
-    
-    results = hybrid_search(query, top_articles=10, top_chunks=20)
-    
-    print(f"\n🤖 Genereerin RAG vastust...")
-    rag_result = rag_query(
-        query=results["query"],
-        articles=results["articles"],
-        chunks=results["chunks"],
-        temperature=0.6,
-        max_tokens=1500
-    )
-    
-    print("\n" + "=" * 70)
-    print("🎯 RAG VASTUS:")
-    print("=" * 70)
-    print(rag_result["answer"])
-    
-    print("\n" + "=" * 70)
-    print("📊 STATISTIKA:")
-    print("=" * 70)
-    print(f"Weaviate konteksti: {rag_result['parameters']['context_weaviate_chars']} märki")
-    print(f"Kokku konteksti: {rag_result['parameters']['context_chars']} märki")
-    print(f"Kokku tokenit: ~{rag_result['parameters']['context_tokens_approx']}")

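With the __main__ demo removed from this module, a minimal caller sketch (mirroring the deleted block; hybrid_search is used with the same signature as there):

# sketch: driving rag_query from query_hybrid, as the removed __main__ block did
from src.query_hybrid import hybrid_search
from src.generate_answer import rag_query

results = hybrid_search("road safety", top_articles=10, top_chunks=20)
rag_result = rag_query(
    query=results["query"],
    articles=results["articles"],
    chunks=results["chunks"],
    temperature=0.6,
    max_tokens=1500,
)
print(rag_result["answer"])
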
+ 290 - 68
src/sync_weaviate.py

@@ -1,10 +1,17 @@
 # file: src/sync_weaviate.py
-# file: src/sync_weaviate.py
+
 import os
-import uuid
+import json
 from pathlib import Path
+from typing import Any, Dict, List, Optional
+from datetime import datetime
 from dotenv import load_dotenv
+
 import psycopg2
+from psycopg2.extras import execute_values
+from psycopg2 import sql
+from uuid import UUID
+
 from weaviate import WeaviateClient
 from weaviate.connect import ConnectionParams
 
@@ -16,7 +23,6 @@ DB_NAME = os.getenv("DB_NAME", "pdf_research")
 DB_USER = os.getenv("DB_USER", "pdf_user")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 
-# Weaviate - use the actual address
 WEAVIATE_HOST = os.getenv("WEAVIATE_HOST", "localhost")
 WEAVIATE_HTTP_PORT = int(os.getenv("WEAVIATE_HTTP_PORT", "8080"))
 WEAVIATE_CLASS = os.getenv("WEAVIATE_CLASS", "ScientificArticle")
@@ -32,104 +38,320 @@ def get_db_conn():
 
 def get_weaviate_client() -> WeaviateClient:
     """Weaviate v4 klient - HTTP ainult"""
-    client = WeaviateClient(connection_params=ConnectionParams.from_params(
-        http_host=WEAVIATE_HOST,
-        http_port=WEAVIATE_HTTP_PORT,
-        http_secure=False,
-        grpc_host=WEAVIATE_HOST,
-        grpc_port=50051,  # default gRPC port; does not have to be running
-        grpc_secure=False,
-    ))
+    client = WeaviateClient(
+        connection_params=ConnectionParams.from_params(
+            http_host=WEAVIATE_HOST,
+            http_port=WEAVIATE_HTTP_PORT,
+            http_secure=False,
+            grpc_host=WEAVIATE_HOST,
+            grpc_port=50051,
+            grpc_secure=False,
+        )
+    )
     client.connect()
     print(f"✓ Ühendatud Weaviate'ga: {WEAVIATE_HOST}:{WEAVIATE_HTTP_PORT}")
     return client
 
-def sync_weaviate_to_postgres():
-    """
-    Reads all ScientificArticle objects from Weaviate,
-    looks up the matching raw_documents rows (by source_file),
-    and updates the weaviate_article_id column.
-    """
-    print("Connecting to Weaviate...")
-    client = get_weaviate_client()
-    
-    try:
-        is_ready = client.is_ready()
-        print(f"✓ Weaviate on valmis: {is_ready}")
-    except Exception as e:
-        print(f"❌ Weaviate ühenduse viga: {e}")
-        client.close()
-        return
+def clean_nul_bytes(value):
+    """Eemalda NULL-baidid kõikidest väärtustest"""
+    if isinstance(value, str):
+        return value.replace("\x00", "").replace("\x01", "")
+    elif isinstance(value, dict):
+        return {k: clean_nul_bytes(v) for k, v in value.items()}
+    elif isinstance(value, list):
+        return [clean_nul_bytes(item) for item in value]
+    return value
 
+def fetch_weaviate_articles() -> List[Dict[str, Any]]:
+    """Loeb kõik ScientificArticle objektid Weaviate'st koos metaandmetega."""
     print(f"Fetching articles from Weaviate (klass: {WEAVIATE_CLASS})...")
     
+    client = get_weaviate_client()
     try:
-        # Weaviate v4: fetch_objects without a filter
         collection = client.collections.get(WEAVIATE_CLASS)
         
-        # Fetch all objects - use a large limit
         results = collection.query.fetch_objects(
             limit=10000,
-            return_properties=["source_file", "title"]
+            return_properties=[
+                "source_file",
+                "title",
+                "doi",
+                "journal",
+                "year",
+                "authors",
+                "key_concepts",
+                "methods_used",
+                "summary_et",
+                "abstract_en",
+                "transport_context",
+                "relevance_score",
+                "processing_date",
+                "file_hash",
+            ],
         )
         
         print(f"✓ Leitud {len(results.objects)} artiklit Weaviate'st")
-    except Exception as e:
-        print(f"❌ Weaviate päringu viga: {e}")
-        import traceback
-        traceback.print_exc()
+        
+        articles: List[Dict[str, Any]] = []
+        for obj in results.objects:
+            props = obj.properties or {}
+            
+            source_file = (props.get("source_file") or "").strip()
+            if not source_file:
+                continue
+            
+            # Normalize the path: strip the ./data/pdfs/ or data/pdfs/ prefix
+            if source_file.startswith("./data/pdfs/"):
+                filename = source_file.replace("./data/pdfs/", "")
+            elif source_file.startswith("data/pdfs/"):
+                filename = source_file.replace("data/pdfs/", "")
+            else:
+                filename = Path(source_file).name
+            
+            article = {
+                "weaviate_id": str(obj.uuid),
+                "source_file": source_file,
+                "filename": filename,
+                "title": props.get("title"),
+                "doi": props.get("doi"),
+                "journal": props.get("journal"),
+                "year": props.get("year"),
+                "authors": props.get("authors") or [],
+                "key_concepts": props.get("key_concepts") or [],
+                "methods_used": props.get("methods_used") or [],
+                "summary_et": props.get("summary_et"),
+                "abstract_en": props.get("abstract_en"),
+                "transport_context": props.get("transport_context"),
+                "relevance_score": props.get("relevance_score"),
+                "processing_date": props.get("processing_date"),
+                "file_hash": clean_nul_bytes(str(props.get("file_hash") if props.get("file_hash") else "")),
+            }
+            
+            articles.append(article)
+        
+        return articles
+    
+    finally:
         client.close()
-        return
 
+def sync_weaviate_to_postgres():
+    """
+    Syncs Weaviate ScientificArticle metadata into the raw_documents table.
+    MATCHING STRATEGY (in priority order):
+    1. File-hash match (most reliable)
+    2. Filename match (fallback; may produce false positives)
+    - match: raw_documents.filename == Path(source_file).name
+    - updates: weaviate_article_id, title, doi, journal, year, authors, key_concepts,
+               methods_used, summary_et, abstract_en, transport_context, relevance_score, processing_date
+    """
+    print("Connecting to PostgreSQL...")
     conn = get_db_conn()
     cur = conn.cursor()
-
+    
+    # Build a file_hash → (id, filename) lookup from PostgreSQL
+    print("Loading PostgreSQL hash index...")
+    cur.execute("SELECT id, filename, file_hash FROM raw_documents WHERE file_hash IS NOT NULL")
+    postgres_by_hash = {}
+    for row in cur.fetchall():
+        doc_id, filename, file_hash = row
+        if file_hash:
+            postgres_by_hash[file_hash] = (doc_id, filename)
+            #print(f"file_hash:  {file_hash}")
+    print(f"   Loaded {len(postgres_by_hash)} documents with hashes\n")
+        
+    synced = 0
+    not_found_hash = 0
+    not_found_filename = 0
+    errors = 0
+    
+    stats = {
+        "hash_matches": 0,
+        "filename_matches": 0,
+        "filename_fallback": 0,
+    }
+  
+    articles = fetch_weaviate_articles()
+    #print(f"Starting synchronization of {len(articles)} articles...")
+    
-
-    for obj in results.objects:
+    
+    for art in articles:
         try:
-            props = obj.properties
-            source_file = props.get("source_file")
-            title = props.get("title", "N/A")
-            weaviate_id = obj.uuid
-
-            if not source_file or not weaviate_id:
-                print(f"⚠️ Puudub source_file või id: {title}")
-                not_found += 1
-                continue
+            filename = art["filename"]
+            weaviate_id = art["weaviate_id"]
+            source_file = art["source_file"]
+            #print(f"Testimine:  {filename}  -  {weaviate_uuid} -  {source_file}")
+            weaviate_hash = art.get("file_hash", "")
+            #print(f"Testimine weaviate_hash:  {weaviate_hash},  weaviate_id:  {weaviate_id} ")
 
-            weaviate_uuid = str(weaviate_id)
-            filename = Path(source_file).name
-
-            cur.execute(
-                "SELECT id FROM raw_documents WHERE filename = %s",
-                (filename,)
-            )
-            row = cur.fetchone()
-
-            if row:
-                raw_doc_id = row[0]
+            raw_doc_id = None
+            match_strategy = None
+            
+            # STRATEGY 1: file-hash match (primary)
+            if weaviate_hash:
+                result = postgres_by_hash.get(weaviate_hash)
+                if result:
+                    raw_doc_id, pg_filename = result
+                    match_strategy = "HASH"
+                    stats["hash_matches"] += 1
+                    print(f"✅ [HASH] {filename}")
+                else:
+                    not_found_hash += 1
+            
+            # STRATEGY 2: filename match (fallback)
+            if not raw_doc_id:
                 cur.execute(
-                    "UPDATE raw_documents SET weaviate_article_id = %s WHERE id = %s",
-                    (weaviate_uuid, raw_doc_id)
+                    "SELECT id FROM raw_documents WHERE filename = %s",
+                    (filename,)
                 )
-                synced += 1
-                print(f"✓ {title[:50]}... (pg_id={raw_doc_id})")
+                row = cur.fetchone()
+                if row:
+                    raw_doc_id = row[0]
+                    match_strategy = "FILENAME"
+                    stats["filename_matches"] += 1
+                    
+                    if not weaviate_hash:
+                        stats["filename_fallback"] += 1
+                        print(f"⚠️  [FALLBACK] {filename} (filehash missing)")
+                    else:
+                        print(f"⚠️  [FALLBACK] {filename} (filehash: {weaviate_hash[:16]}...)")
+                else:
+                    not_found_filename += 1
+                    print(f"❌ [NOT FOUND] {filename}")
+                    continue
+                        
+            # UPDATE raw_documents
+            title = clean_nul_bytes(art.get("title"))
+            doi = clean_nul_bytes(art.get("doi"))
+            journal = clean_nul_bytes(art.get("journal"))
+            year = art.get("year")
+            relevance_score = art.get("relevance_score")
+            
+            summary_et = clean_nul_bytes(art.get("summary_et", ""))
+            abstract_en = clean_nul_bytes(art.get("abstract_en", ""))
+            authors = clean_nul_bytes(art.get("authors", ""))
+            key_concepts = clean_nul_bytes(art.get("key_concepts", ""))
+            methods_used = clean_nul_bytes(art.get("methods_used", ""))
+            
+            # Transport context JSON
+            transport_context_json = None
+            if art.get("transport_context"):
+                try:
+                    if isinstance(art["transport_context"], str):
+                        transport_context_json = json.loads(art["transport_context"])
+                    else:
+                        transport_context_json = art["transport_context"]
+                except (json.JSONDecodeError, TypeError):
+                    transport_context_json = None
+                    
+            if transport_context_json:
+                transport_context = json.dumps(transport_context_json) 
             else:
-                not_found += 1
-                print(f"⚠️ Ei leitud pgvector'ist: {filename}")
+                transport_context = None
+            
+            # Processing date
+            processing_date = None
+            if art.get("processing_date"):
+                try:
+                    if isinstance(art["processing_date"], str):
+                        processing_date = datetime.fromisoformat(art["processing_date"])
+                    else:
+                        processing_date = art["processing_date"]
+                except (ValueError, TypeError):
+                    processing_date = None          
+            
+            #print(f"1.weaviate_id {weaviate_id}")
+            #print(f"2.title {title}")
+            #print(f"3.doi {doi}")
+            #print(f"4.journal {journal}")
+            #print(f"5.year {year}")
+            #print(f"6.authors {authors}")
+            #print(f"7.key_concepts {key_concepts}")
+            #print(f"8.methods_used {methods_used}")
+            #print(f"9.summary_et {summary_et}")
+            #print(f"10.transport_context {transport_context_json}")
+            #print(f"11.relevance_score {relevance_score}")
+            #print(f"12.processing_date {processing_date}")
+            
+            # Update the fields; COALESCE keeps the existing value when the new one is NULL
+            cur.execute(
+                """
+                UPDATE raw_documents
+                SET
+                    weaviate_article_id = %s,
+                    title = COALESCE(%s, title),
+                    doi = COALESCE(%s, doi),
+                    journal = COALESCE(%s, journal),
+                    year = COALESCE(%s, year),
+                    authors = COALESCE(%s, authors),
+                    key_concepts = COALESCE(%s, key_concepts),
+                    methods_used = COALESCE(%s, methods_used),
+                    summary_et = COALESCE(%s, summary_et),
+                    abstract_en = COALESCE(%s, abstract_en),
+                    transport_context = COALESCE(%s::jsonb, transport_context),
+                    relevance_score = COALESCE(%s, relevance_score),
+                    processing_date = COALESCE(%s, processing_date)
+                WHERE id = %s
+                """,
+                (
+                    weaviate_id,
+                    title,
+                    doi,
+                    journal,
+                    year,
+                    authors,
+                    key_concepts,
+                    methods_used,
+                    summary_et,
+                    abstract_en,
+                    transport_context,
+                    relevance_score,
+                    processing_date,
+                    raw_doc_id,
+                ),
+            )
+            
+            conn.commit()  # commit per article so a later failure cannot roll earlier updates back
+            synced += 1
+            print(f"✓ {(art.get('title') or '')[:60]}... (filename={filename}, pg_id={raw_doc_id})")
+        
         except Exception as e:
             errors += 1
-            print(f"❌ Viga objektiga: {e}")
-
+            print(f"❌ Viga {art.get('title', 'N/A')} (file={art.get('filename')}): {e}")
+            # ✅ OLULINE: Rollback selle vea jaoks!
+            conn.rollback()
+            # Ühenda uuesti
+            cur.close()
+            conn.close()
+            conn = get_db_conn()
+            cur = conn.cursor()
+    
     conn.commit()
     cur.close()
     conn.close()
-    client.close()
-
+    
     print(f"\n✓ Kokkuvõte: {synced} sünkroniseeritud, {not_found} ei leitud, {errors} viga")
 
 if __name__ == "__main__":
     sync_weaviate_to_postgres()
+    # Simple debug: print the articles currently in Weaviate
+    articles = fetch_weaviate_articles()
+    print(f"Found {len(articles)} articles in Weaviate")
+    #for art in articles:
+    #    print(
+    #        f"WeaviateID={art['weaviate_id']} "
+    #        f"filename={art['filename']} "
+    #        f"doi={art.get('doi')} "
+    #        f"title={art.get('title')[:80] if art.get('title') else ''}"
+    #    )
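
A sync run can be verified with a small check counting the linked rows (column names from the ALTER TABLE in scripts/db_vector_välja_muutmine.sh):

# sketch: how many raw_documents rows got a Weaviate link
import psycopg2

conn = psycopg2.connect("postgresql://osm:osm@100.87.1.24:5432/pdf_research")
with conn.cursor() as cur:
    cur.execute("""
        SELECT COUNT(*) FILTER (WHERE weaviate_article_id IS NOT NULL), COUNT(*)
        FROM raw_documents
    """)
    linked, total = cur.fetchone()
print(f"{linked}/{total} documents linked to Weaviate")
conn.close()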