
Initial commit

ardo committed 3 months ago
Parent commit d4c68f1e68

+ 8 - 0
.gitignore

@@ -58,3 +58,11 @@ docs/_build/
 # PyBuilder
 target/
 
+# Ardo
+testimiseks/
+data/chunks
+data/extracted
+data/processed
+data/raw_pdfs
+archive/
+output/

+ 87 - 0
LOEMIND.md

@@ -0,0 +1,87 @@
+
+
+# Activate the venv
+source ~/venvs/pdf-env/bin/activate
+cd ~/rag-demo/pdf-pipeline
+export $(grep -v '^#' .env | xargs -d '\n')
+
+
+python -m src.db_schema  # re-run with the new schema
+# Change the vector column
+./scripts/db_vector_välja_muutmine.sh
+
+python -m src.extract_pdf  # with the fixed extraction
+python -m src.check_status  # with the improved overview
+
+python -m src.find_duplicates  # duplicate check
+python -m src.clean_and_normalize
+
+python -m src.create_chunks
+python -m src.embed_chunks
+
+# Testing
+docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "SELECT COUNT(*) FROM processed_documents;"
+docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "
+  SELECT id, raw_doc_id, page, LENGTH(content_text) AS len, has_table
+  FROM processed_documents
+  ORDER BY id
+  LIMIT 20;
+"
+
+# Sync with the Weaviate database
+python -m src.sync_weaviate
+
+python -m src.query_hybrid
+python -m src.generate_answer
+
+# Start the API
+python -m src.rag_api
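+# Alternative (a sketch; assumes the FastAPI app object is src.rag_api:app
+# and the port 8071 used in the examples below):
+# uvicorn src.rag_api:app --host 0.0.0.0 --port 8071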
+
+## ==============================================================
+## API request examples
+### 1. **Interactive Docs** (Swagger UI)
+Open in a browser:
+```
+http://localhost:8071/docs
+```
+### 2. **With cURL**
+```bash
+# Simplified search
+curl "http://localhost:8071/search-simple?q=young%20driver%20risk"
+# POST request (full control)
+curl -X POST "http://localhost:8071/search" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "query": "young driver accident risk",
+    "top_articles": 10,
+    "top_chunks": 20,
+    "temperature": 0.6,
+    "max_tokens": 1500
+  }' | jq '.'
+```
+### 3. **With Python requests**
+
+```python
+import requests
+import json
+
+# POST request
+response = requests.post(
+    "http://localhost:8071/search",
+    json={
+        "query": "traffic flow prediction",
+        "top_articles": 10,
+        "top_chunks": 20,
+        "temperature": 0.6,
+        "max_tokens": 1500
+    }
+)
+
+result = response.json()
+print("Küsimus:", result["query"])
+print("Vastus:", result["answer"])
+print("Allikad:", result["articles"])
+```
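+
+Before issuing full RAG queries, it can help to hit the health endpoint first
+(a minimal sketch; assumes the API listens on port 8071 as above):
+
+```python
+import requests
+
+# GET /health returns a small JSON status document
+print(requests.get("http://localhost:8071/health").json())
+```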
+
+
+

+ 13 - 0
config/sql/kasutaja_ja_skeemi_loomine.sql

@@ -0,0 +1,13 @@
+-- 1. Create the admin user 'osm' with password 'osm'
+-- NB! SUPERUSER grants full privileges; narrow them down if needed
+CREATE ROLE osm
+  WITH
+    LOGIN
+    SUPERUSER
+    CREATEDB
+    CREATEROLE
+    PASSWORD 'osm';
+
+-- 2. Create the pdf_research database
+CREATE DATABASE pdf_research
+    WITH OWNER = osm;
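+
+-- To apply this script (a sketch; assumes a local 'postgres' superuser):
+--   psql -U postgres -h localhost -f config/sql/kasutaja_ja_skeemi_loomine.sql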

+ 130 - 0
migrate_weaviate.py

@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""
+Weaviate collection migration (remote -> local)
+"""
+import weaviate
+
+# Connections
+REMOTE_WEAVIATE_URL = "http://hetzner:9020"
+LOCAL_WEAVIATE_URL = "http://localhost:8080"
+COLLECTION_NAME = "ScientificArticles"
+BATCH_SIZE = 100
+
+def connect_weaviate(url: str):
+    """Ühenda Weaviate'ga"""
+    client = weaviate.Client(url)
+    if not client.is_ready():
+        raise ConnectionError(f"Weaviate pole käimas: {url}")
+    return client
+
+def get_schema(client, collection_name: str):
+    """Tõmba kollektsiooni schema"""
+    schema = client.schema.get()
+    for cls in schema.get("classes", []):
+        if cls["class"] == collection_name:
+            return cls
+    raise ValueError(f"Kollektsiooni {collection_name} pole olemas")
+
+def export_collection(client, collection_name: str):
+    """Ekspordi kõik objektid Remote Weaviate'st"""
+    print(f"📤 Eksporditakse {collection_name} remote'st ({REMOTE_WEAVIATE_URL})...")
+    
+    where_filter = {
+        "path": ["__typename"],
+        "operator": "NotEqual",
+        "valueString": ""
+    }
+    
+    objects = []
+    after = None
+    
+    while True:
+        response = client.data_object.get(
+            collection_name=collection_name,
+            limit=BATCH_SIZE,
+            after=after,
+            where=where_filter
+        )
+        
+        batch = response.get("objects", [])
+        if not batch:
+            break
+        
+        objects.extend(batch)
+        print(f"  ✓ Eksportitud {len(objects)} objekti...")
+        
+        # Jätka järgmiselt batshi
+        if len(batch) < BATCH_SIZE:
+            break
+        after = batch[-1]["id"]
+    
+    print(f"✅ Kokku eksportitud: {len(objects)} objekti")
+    return objects
+
+def import_collection(client, collection_name: str, objects: list):
+    """Importi objektid lokaalse Weaviate'sse"""
+    print(f"📥 Importitakse {collection_name} lokaalse Weaviate'sse...")
+    
+    success_count = 0
+    error_count = 0
+    
+    for i, obj in enumerate(objects, 1):
+        try:
+            # Eemalda ID, et Weaviate saaks luua uue
+            obj_data = obj.copy()
+            obj_id = obj_data.pop("id", None)
+            
+            client.data_object.create(
+                data_object=obj_data.get("properties", {}),
+                class_name=collection_name,
+                uuid=obj_id  # Kasuta sama ID
+            )
+            success_count += 1
+            
+            if i % 50 == 0:
+                print(f"  ✓ Importitud {i}/{len(objects)} objekti...")
+        except Exception as e:
+            error_count += 1
+            print(f"  ❌ Objekt {obj.get('id')}: {str(e)}")
+    
+    print(f"✅ Importimine valmis: {success_count} edu, {error_count} viga")
+    return success_count
+
+def main():
+    try:
+        # Connect
+        print("🔗 Connecting to the remote Weaviate...")
+        remote_client = connect_weaviate(REMOTE_WEAVIATE_URL)
+
+        print("🔗 Connecting to the local Weaviate...")
+        local_client = connect_weaviate(LOCAL_WEAVIATE_URL)
+
+        # Check the collection
+        print(f"📋 Checking collection {COLLECTION_NAME}...")
+        schema = get_schema(remote_client, COLLECTION_NAME)
+        print(f"  ✓ Found: {schema['class']}")
+
+        # Create a local collection with the same schema (if missing)
+        local_schema = local_client.schema.get()
+        local_classes = {cls["class"] for cls in local_schema.get("classes", [])}
+        if COLLECTION_NAME in local_classes:
+            print("⚠️  Local collection already exists")
+        else:
+            print(f"📝 Creating local collection {COLLECTION_NAME}...")
+            local_client.schema.create_class(schema)
+        
+        # Export + import
+        objects = export_collection(remote_client, COLLECTION_NAME)
+        import_collection(local_client, COLLECTION_NAME, objects)
+        
+        print("\n✨ Migratsioon valmis!")
+        
+    except Exception as e:
+        print(f"❌ Viga: {str(e)}")
+        return 1
+    
+    return 0
+
+if __name__ == "__main__":
+    exit(main())

+ 220 - 0
migrate_weaviate_rest.py

@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""
+Weaviate migration via the REST API - ScientificArticle only
+Strips the vectorizer config so that the local Weaviate can create the schema
+"""
+import requests
+import copy
+
+REMOTE_URL = "http://hetzner:9020"
+LOCAL_URL = "http://localhost:8080"
+COLLECTION_NAME = "ScientificArticle"
+
+def clean_schema(schema: dict):
+    """Eemalda vectorizer/module references"""
+    cleaned = copy.deepcopy(schema)
+    
+    # Eemalda vectorizer konfid
+    cleaned.pop("vectorizer", None)
+    cleaned.pop("vectorizerConfig", None)
+    cleaned.pop("vectorIndexConfig", None)
+    cleaned.pop("vector_config", None)
+    cleaned.pop("vectorIndexType", None)
+    
+    # Eemalda propertytest vectorizer infod
+    for prop in cleaned.get("properties", []):
+        prop.pop("vectorizer", None)
+        prop.pop("vectorizerConfig", None)
+        prop.pop("vectorizer_configs", None)
+    
+    return cleaned
+
+def get_collection_schema(url: str, collection: str):
+    """Tõmba kollektsiooni schema"""
+    print(f"📋 Schema: {collection}...")
+    try:
+        resp = requests.get(f"{url}/v1/schema", timeout=10)
+        schema = resp.json()
+        
+        for cls in schema.get("classes", []):
+            if cls["class"] == collection:
+                print(f"  ✓ Schema leitud")
+                return cls
+        return None
+    except Exception as e:
+        print(f"  ❌ Viga: {e}")
+        return None
+
+def get_objects(url: str, collection: str):
+    """Tõmba kõik objektid REST API abil"""
+    print(f"\n📤 Eksporditakse {collection}...")
+    
+    objects = []
+    offset = 0
+    limit = 100
+    
+    while True:
+        try:
+            resp = requests.get(
+                f"{url}/v1/objects",
+                params={
+                    "class": collection,
+                    "limit": limit,
+                    "offset": offset
+                },
+                timeout=30
+            )
+            
+            if resp.status_code != 200:
+                print(f"  ⚠️  Viga: {resp.status_code}")
+                break
+            
+            data = resp.json()
+            items = data.get("objects", [])
+            
+            if not items:
+                break
+            
+            objects.extend(items)
+            offset += len(items)
+            print(f"  ✓ Tõmmatud: {offset} objekti...")
+            
+            if len(items) < limit:
+                break
+        
+        except Exception as e:
+            print(f"  ❌ Viga: {e}")
+            break
+    
+    print(f"✅ Eksportitud kokku: {len(objects)} objekti\n")
+    return objects
+
+def create_collection_local(url: str, schema: dict):
+    """Loo kolleektsioon lokaalse Weaviate'sse"""
+    collection_name = schema["class"]
+    print(f"📝 Kontrollitakse lokaalne kolleektsioon '{collection_name}'...")
+    
+    try:
+        # Check whether it already exists
+        resp = requests.get(f"{url}/v1/schema", timeout=10)
+        existing = resp.json()
+        
+        for cls in existing.get("classes", []):
+            if cls["class"] == collection_name:
+                print(f"  ✓ Kolleektsioon '{collection_name}' juba eksisteerib\n")
+                return True
+        
+        # Clean the schema (strip the vectorizer info)
+        clean = clean_schema(schema)
+        
+        # Create a new one
+        print("  📝 Creating a new collection (without a vectorizer)...")
+        resp = requests.post(
+            f"{url}/v1/schema",
+            json=clean,
+            timeout=10
+        )
+        
+        if resp.status_code in [200, 201]:
+            print(f"  ✓ Kolleektsioon '{collection_name}' loodud\n")
+            return True
+        else:
+            print(f"  ❌ Viga: {resp.status_code} - {resp.text[:200]}\n")
+            return False
+    
+    except Exception as e:
+        print(f"  ❌ Viga: {e}\n")
+        return False
+
+def import_objects_rest(url: str, collection: str, objects: list):
+    """Importi objektid REST API abil"""
+    print(f"📥 Importitakse {len(objects)} objekti...")
+    
+    success = 0
+    errors = 0
+    
+    for i, obj in enumerate(objects, 1):
+        try:
+            obj_id = obj.get("id")
+            vector = obj.get("vector")
+            properties = obj.get("properties", {})
+            
+            payload = {
+                "class": collection,
+                "id": obj_id,
+                "properties": properties
+            }
+            
+            if vector:
+                payload["vector"] = vector
+            
+            resp = requests.post(
+                f"{url}/v1/objects",
+                json=payload,
+                timeout=10
+            )
+            
+            if resp.status_code in [200, 201]:
+                success += 1
+            else:
+                errors += 1
+                if errors <= 5:
+                    print(f"  ⚠️  Objekt {i}: {resp.status_code}")
+            
+            if i % 50 == 0:
+                print(f"  ✓ Importitud {i}/{len(objects)}...")
+        
+        except Exception as e:
+            errors += 1
+            if errors <= 5:
+                print(f"  ⚠️  Viga {i}: {e}")
+    
+    print(f"\n✅ Import valmis: {success} edu, {errors} viga\n")
+    return success
+
+def main():
+    print(f"\n{'='*60}")
+    print(f"WEAVIATE MIGRATSIOON: {COLLECTION_NAME}")
+    print(f"{'='*60}\n")
+    
+    print("🔗 Kontrollitakse ühendusi...")
+    
+    try:
+        resp = requests.get(f"{REMOTE_URL}/v1/meta", timeout=5)
+        print(f"  ✓ Remote ({REMOTE_URL}): {resp.json().get('version', 'OK')}")
+    except Exception as e:
+        print(f"  ❌ Remote viga: {e}")
+        return 1
+    
+    try:
+        resp = requests.get(f"{LOCAL_URL}/v1/meta", timeout=5)
+        print(f"  ✓ Lokaalne ({LOCAL_URL}): {resp.json().get('version', 'OK')}\n")
+    except Exception as e:
+        print(f"  ❌ Lokaalne viga: {e}")
+        return 1
+    
+    schema = get_collection_schema(REMOTE_URL, COLLECTION_NAME)
+    if not schema:
+        print(f"❌ Kollektsiooni '{COLLECTION_NAME}' remote'l pole\n")
+        return 1
+    
+    if not create_collection_local(LOCAL_URL, schema):
+        print(f"❌ Lokaalne kolleektsioon ei loodud\n")
+        return 1
+    
+    objects = get_objects(REMOTE_URL, COLLECTION_NAME)
+    
+    if not objects:
+        print(f"⚠️  Kollektsioonis '{COLLECTION_NAME}' pole objekte\n")
+        return 0
+    
+    import_objects_rest(LOCAL_URL, COLLECTION_NAME, objects)
+    
+    print(f"{'='*60}")
+    print("✨ Migratsioon valmis!")
+    print(f"{'='*60}\n")
+    return 0
+
+if __name__ == "__main__":
+    exit(main())

+ 33 - 0
scripts/db_vector_välja_muutmine.sh

@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Change the column width
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+  ALTER TABLE chunks
+      ALTER COLUMN embedding TYPE vector(384);
+  \"
+"
+
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+  DROP INDEX IF EXISTS idx_chunks_embedding;
+  CREATE INDEX idx_chunks_embedding ON chunks
+  USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
+  \"
+"
+
+# Add the weaviate_article_id column to the raw_documents table
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+ALTER TABLE raw_documents ADD COLUMN weaviate_article_id UUID;
+CREATE INDEX idx_raw_weaviate_id ON raw_documents(weaviate_article_id);
+  \"
+"
+# Check that the column exists
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+  SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'raw_documents' ORDER BY ordinal_position;
+  \"
+"

+ 87 - 0
src/check_status.py

@@ -0,0 +1,87 @@
+# file: src/check_status.py - FIXED
+import os
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def check_status():
+    conn = psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+    cur = conn.cursor()
+
+    print("=" * 60)
+    print("📊 EKSTRAKTSIOONI STAATUS")
+    print("=" * 60)
+
+    # Raw documents
+    cur.execute("""
+    SELECT 
+        COUNT(*) as doc_count, 
+        SUM(pages) as total_pages,
+        SUM(file_size_bytes) as total_size_bytes
+    FROM raw_documents
+    """)
+    doc_count, total_pages, total_size = cur.fetchone()
+    total_size_mb = (total_size or 0) / (1024 * 1024)
+    print(f"\n✓ Raw dokumentid:")
+    print(f"  - Failid: {doc_count}")
+    print(f"  - Lehti kokku: {total_pages}")
+    print(f"  - Suurus: {total_size_mb:.2f} MB")
+
+    # Processed parts
+    cur.execute("""
+    SELECT 
+        COUNT(*) as total_parts,
+        SUM(CASE WHEN has_table THEN 1 ELSE 0 END) as table_parts,
+        SUM(CASE WHEN NOT has_table THEN 1 ELSE 0 END) as text_parts
+    FROM processed_documents
+    """)
+    total_parts, table_parts, text_parts = cur.fetchone()
+    print(f"\n✓ Töödeldud osad:")
+    print(f"  - Kokku: {total_parts}")
+    print(f"  - Tekstiosad: {text_parts}")
+    print(f"  - Tabeliosad: {table_parts}")
+
+    # Document details
+    print("\n✓ Document list:")
+    cur.execute("""
+    SELECT 
+        filename, 
+        pages, 
+        file_hash,
+        extracted_at,
+        (SELECT COUNT(*) FROM processed_documents WHERE raw_doc_id = raw_documents.id) as processed_parts
+    FROM raw_documents 
+    ORDER BY extracted_at DESC
+    """)
+    docs = cur.fetchall()
+    for filename, pages, file_hash, extracted_at, proc_parts in docs:
+        hash_short = file_hash[:8] if file_hash else "N/A"
+        print(f"  - {filename}")
+        print(f"    Hash: {hash_short}... | Lehti: {pages} | Töödeldud osad: {proc_parts}")
+        print(f"    Ekstraktitud: {extracted_at}")
+
+    cur.close()
+    conn.close()
+
+    print("\n" + "=" * 60)
+
+if __name__ == "__main__":
+    check_status()

+ 58 - 0
src/chunk_utils.py

@@ -0,0 +1,58 @@
+# file: src/chunk_utils.py
+import re
+from typing import List, Dict
+
+def split_into_sentences(text: str) -> List[str]:
+    # very simple sentence splitting
+    parts = re.split(r'(?<=[.!?])\s+', text.strip())
+    return [p.strip() for p in parts if p.strip()]
+
+def chunk_text(
+    text: str,
+    max_chars: int = 1500,
+    overlap_chars: int = 300
+) -> List[Dict]:
+    """
+    Recursive / semantiline chunking:
+    - proovib hoida lausepiirid
+    - loob chunkid ~max_chars, ülekate overlap_chars
+    """
+    if len(text) <= max_chars:
+        return [{"text": text, "start": 0, "end": len(text)}]
+
+    sentences = split_into_sentences(text)
+    chunks = []
+    current = ""
+    start_idx = 0
+
+    for sent in sentences:
+        if not current:
+            current = sent
+            start_idx = text.find(sent, start_idx)
+            continue
+
+        # If the next sentence still fits
+        if len(current) + 1 + len(sent) <= max_chars:
+            current = current + " " + sent
+        else:
+            end_idx = start_idx + len(current)
+            chunks.append({"text": current, "start": start_idx, "end": end_idx})
+
+            # Overlap taken from the end of the previous chunk
+            overlap_start = max(0, end_idx - overlap_chars)
+            current = text[overlap_start:end_idx]
+            start_idx = overlap_start
+
+            # If the overlap plus the new sentence is already too long,
+            # start a fresh chunk from the sentence instead of emitting a
+            # chunk that merely duplicates the previous chunk's tail
+            if len(current) + 1 + len(sent) > max_chars:
+                current = sent
+                start_idx = text.find(sent, end_idx)
+            else:
+                current = current + " " + sent
+
+    if current:
+        end_idx = start_idx + len(current)
+        chunks.append({"text": current, "start": start_idx, "end": end_idx})
+
+    return chunks
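+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustration only, not part of the pipeline):
+    demo = "First sentence here. Second sentence follows. " * 50
+    for c in chunk_text(demo, max_chars=120, overlap_chars=30):
+        print(c["start"], c["end"], repr(c["text"][:40]))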

+ 105 - 0
src/clean_and_normalize.py

@@ -0,0 +1,105 @@
+# file: src/clean_and_normalize.py
+import os
+import re
+import json
+from pathlib import Path
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+EXTRACTED_OUTPUT_DIR = Path(os.getenv("EXTRACTED_OUTPUT_DIR", "data/extracted"))
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def clean_text_basic(text: str) -> str:
+    # Very simple cleanup: trim, collapse repeated whitespace
+    if not text:
+        return ""
+    text = text.replace("\r", "\n")
+    text = text.replace("\u00a0", " ")  # non-breaking space
+    text = re.sub(r"[ \t]+", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
+def process_single_doc(raw_doc_id: int, filename: str):
+    pdf_stem = Path(filename).stem
+    json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"
+
+    if not json_path.exists():
+        print(f"❌ pages.json puudub: {json_path}")
+        return
+
+    pages_data = json.loads(json_path.read_text())
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    inserted = 0
+
+    for page in pages_data:
+        page_num = page["page"]
+        raw_text = page.get("text") or ""
+        text = clean_text_basic(raw_text)
+
+        if not text:
+            print(f"  ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
+            continue
+
+        cur.execute("""
+            INSERT INTO processed_documents
+                (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
+            VALUES
+                (%s, %s, %s, %s, %s, %s, %s)
+        """, (
+            raw_doc_id,
+            page_num,
+            f"page_{page_num}",
+            text,
+            False,
+            None,
+            None,
+        ))
+        inserted += 1
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")
+
+def process_all_docs():
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    # Fetch all raw documents
+    cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
+    docs = cur.fetchall()
+    cur.close()
+    conn.close()
+
+    if not docs:
+        print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
+        return
+
+    for raw_doc_id, filename in docs:
+        print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
+        process_single_doc(raw_doc_id, filename)
+
+    print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")
+
+if __name__ == "__main__":
+    process_all_docs()

+ 115 - 0
src/create_chunks.py

@@ -0,0 +1,115 @@
+# file: src/create_chunks.py
+import os
+from dotenv import load_dotenv
+import psycopg2
+import json
+from src.chunk_utils import chunk_text
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1500"))
+CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "300"))
+
+def get_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def create_chunks_for_doc(raw_doc_id: int, filename: str):
+    conn = get_conn()
+    cur = conn.cursor()
+
+    # Fetch all processed_documents rows for this PDF
+    cur.execute("""
+        SELECT id, page, content_text, has_table
+        FROM processed_documents
+        WHERE raw_doc_id = %s
+        ORDER BY page, id
+    """, (raw_doc_id,))
+    rows = cur.fetchall()
+
+    print(f"  -> processed_documents ridu: {len(rows)}")
+
+    chunk_index = 0
+    total_chunks = 0
+
+    for proc_id, page, content_text, has_table in rows:
+        if not content_text or len(content_text.strip()) == 0:
+            print(f"     (id={proc_id}, page={page}) tühi content_text, jätan vahele")
+            continue
+
+        chunks = chunk_text(
+            content_text,
+            max_chars=CHUNK_SIZE,
+            overlap_chars=CHUNK_OVERLAP,
+        )
+
+        print(f"     (id={proc_id}, page={page}) -> {len(chunks)} chunk(i)")
+
+        for ch in chunks:
+            chunk_index += 1
+            meta = {
+                "has_table": bool(has_table),
+                "page": int(page),
+            }
+
+            cur.execute("""
+                INSERT INTO chunks
+                (processed_doc_id, raw_doc_id, filename, chunk_index, page, text, num_tokens, metadata)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+            """, (
+                proc_id,
+                raw_doc_id,
+                filename,
+                chunk_index,
+                page,
+                ch["text"],
+                len(ch["text"].split()),
+                json.dumps(meta),  # key fix: serialize metadata as JSON
+            ))
+            total_chunks += 1
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"✓ {filename}: loodud {total_chunks} chunk(i), viimase indeks: {chunk_index}")
+
+def create_chunks_for_all():
+    conn = get_conn()
+    cur = conn.cursor()
+
+    cur.execute("""
+        SELECT id, filename
+        FROM raw_documents
+        ORDER BY id
+    """)
+    docs = cur.fetchall()
+
+    for raw_doc_id, filename in docs:
+        # Check whether chunks already exist
+        cur.execute("SELECT COUNT(*) FROM chunks WHERE raw_doc_id = %s", (raw_doc_id,))
+        count = cur.fetchone()[0]
+        if count > 0:
+            print(f"⏭️  {filename}: chunkid juba olemas ({count} rida). Vahele jätan.")
+            continue
+
+        print(f"🧩 Loon chunkid: {filename}")
+        create_chunks_for_doc(raw_doc_id, filename)
+
+    cur.close()
+    conn.close()
+
+if __name__ == "__main__":
+    create_chunks_for_all()

+ 103 - 0
src/db_schema.py

@@ -0,0 +1,103 @@
+# file: src/db_schema.py - with VECTOR support
+
+import os
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def create_schema():
+    conn = psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+    cur = conn.cursor()
+
+    # pgvector extension
+    try:
+        cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
+        print("✓ pgvector laiendus lubatud")
+    except Exception as e:
+        print(f"⚠️  pgvector laiendus: {e}")
+
+    # Raw documents
+    cur.execute("""
+    CREATE TABLE IF NOT EXISTS raw_documents (
+        id SERIAL PRIMARY KEY,
+        filename VARCHAR(255) NOT NULL,
+        file_hash VARCHAR(32),
+        pages INT,
+        file_size_bytes INT,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        extracted_at TIMESTAMP,
+        extracted_json_path VARCHAR(255),
+        extracted_tables_dir VARCHAR(255),
+        UNIQUE(filename, file_hash)
+    )
+    """)
+
+    # Processed documents
+    cur.execute("""
+    CREATE TABLE IF NOT EXISTS processed_documents (
+        id SERIAL PRIMARY KEY,
+        raw_doc_id INT REFERENCES raw_documents(id) ON DELETE CASCADE,
+        page INT NOT NULL,
+        section VARCHAR(100),
+        content_text TEXT,
+        has_table BOOLEAN DEFAULT FALSE,
+        table_data JSONB,
+        bbox JSONB,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+    )
+    """)
+
+    # Chunks (VECTOR; created as 1536 dims here, later altered to 384 by scripts/db_vector_välja_muutmine.sh)
+    cur.execute("""
+    CREATE TABLE IF NOT EXISTS chunks (
+        id SERIAL PRIMARY KEY,
+        processed_doc_id INT REFERENCES processed_documents(id) ON DELETE CASCADE,
+        raw_doc_id INT REFERENCES raw_documents(id),
+        filename VARCHAR(255),
+        chunk_index INT,
+        page INT,
+        text TEXT,
+        num_tokens INT,
+        embedding VECTOR(1536),
+        metadata JSONB,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+    )
+    """)
+
+    # Indexes
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_raw_filename ON raw_documents(filename)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_raw_filehash ON raw_documents(file_hash)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_proc_raw_id ON processed_documents(raw_doc_id)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_raw_id ON chunks(raw_doc_id)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_filename ON chunks(filename)")
+    
+    # Vector index (IVFFlat for fast search)
+    try:
+        cur.execute("""
+        CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON chunks 
+        USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100)
+        """)
+        print("✓ Vector indeks loodud")
+    except Exception as e:
+        print(f"⚠️  Vector indeksi loomine: {e}")
+
+    conn.commit()
+    cur.close()
+    conn.close()
+    print("✓ Kõik skeemid loodud!")
+
+if __name__ == "__main__":
+    create_schema()

+ 75 - 0
src/embed_chunks.py

@@ -0,0 +1,75 @@
+# file: src/embed_chunks.py
+import os
+from dotenv import load_dotenv
+import psycopg2
+from src.embed_utils import get_embedding
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+BATCH_LIMIT = int(os.getenv("EMBED_BATCH_LIMIT", "500000"))
+
+def get_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def embed_missing_chunks(limit: int | None = None):
+    if limit is None:
+        limit = BATCH_LIMIT
+
+    conn = get_conn()
+    cur = conn.cursor()
+
+    # Fetch chunks whose embedding is NULL
+    cur.execute("""
+        SELECT id, text
+        FROM chunks
+        WHERE embedding IS NULL
+        ORDER BY id
+        LIMIT %s
+    """, (limit,))
+    rows = cur.fetchall()
+
+    if not rows:
+        print("✓ Kõik chunkid on juba embedded.")
+        cur.close()
+        conn.close()
+        return
+
+    print(f"Embedding {len(rows)} chunk(i)...")
+
+    updated = 0
+    for chunk_id, text in rows:
+        text = (text or "").strip()
+        if not text:
+            print(f"  ⚠️ chunk id={chunk_id}: tühi tekst, jätan vahele")
+            continue
+
+        emb = get_embedding(text)  # list[float]
+
+        # psycopg2 sends the list as an array, which pgvector casts to vector on assignment
+        cur.execute(
+            "UPDATE chunks SET embedding = %s WHERE id = %s",
+            (emb, chunk_id),
+        )
+        updated += 1
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"✓ Uuendatud {updated} chunk(i) embeddingu veeruga.")
+
+if __name__ == "__main__":
+    embed_missing_chunks()

+ 31 - 0
src/embed_utils.py

@@ -0,0 +1,31 @@
+# file: src/embed_utils.py
+import os
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+
+load_dotenv()
+
+EMBED_MODEL_NAME = os.getenv("EMBED_MODEL_NAME", "BAAI/bge-small-en-v1.5")
+EMBED_DIM = int(os.getenv("EMBED_DIM", "384"))  # bge-small-en-v1.5
+
+_model: SentenceTransformer | None = None
+
+def get_model() -> SentenceTransformer:
+    global _model
+    if _model is None:
+        print(f"Loading embedding model: {EMBED_MODEL_NAME}")
+        _model = SentenceTransformer(EMBED_MODEL_NAME)
+    return _model
+
+def get_embedding(text: str) -> list[float]:
+    """
+    Tagastab ühe tekstijupi embeddingu.
+    BGE v1.5 ei vaja tingimata special instruction'it, võib kasutada otse. [web:133][web:135][web:140][web:146]
+    """
+    text = (text or "").strip()
+    if not text:
+        return []
+
+    model = get_model()
+    emb = model.encode(text, normalize_embeddings=True)  # normalized embeddings suit cosine similarity
+    return emb.tolist()
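+
+if __name__ == "__main__":
+    # Minimal sanity check (illustration only): the vector length should
+    # match EMBED_DIM (384 for bge-small-en-v1.5)
+    vec = get_embedding("traffic safety test sentence")
+    print(len(vec), vec[:5])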

+ 194 - 0
src/extract_pdf.py

@@ -0,0 +1,194 @@
+# file: src/extract_pdf.py
+import os
+import json
+import csv
+from pathlib import Path
+from dotenv import load_dotenv
+import pdfplumber
+import psycopg2
+import hashlib
+
+load_dotenv()
+
+PDF_INPUT_DIR = Path(os.getenv("PDF_INPUT_DIR", "data/raw_pdfs"))
+EXTRACTED_OUTPUT_DIR = Path(os.getenv("EXTRACTED_OUTPUT_DIR", "data/extracted"))
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def get_file_hash(file_path: Path) -> str:
+    """Arvutab faili MD5 hash'i"""
+    md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+def extract_single_pdf(pdf_path: Path):
+    """
+    Ekstraktib PDF-st:
+    - lehekülgede tekstid JSON-ks (paigutuse metaandmetega)
+    - tabelid CSV-ks (lehekülgeti)
+    """
+    pdf_name = pdf_path.stem
+    output_subdir = EXTRACTED_OUTPUT_DIR / pdf_name
+    output_subdir.mkdir(parents=True, exist_ok=True)
+    
+    pages_data = []
+    tables_meta = []
+    table_count = 0
+    
+    print(f"📖 Käsitlen: {pdf_path.name}")
+    
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, start=1):
+            # Text
+            text = page.extract_text() or ""
+            
+            # Words with layout info (replaces extract_text_dict)
+            words = page.extract_words() or []
+            
+            # Tables
+            tables = page.extract_tables()
+            page_tables = []
+            
+            for table_idx, table in enumerate(tables, start=1):
+                table_count += 1
+                csv_file = output_subdir / f"page_{page_num}_table_{table_idx}.csv"
+                
+                with csv_file.open("w", newline="", encoding="utf-8") as f:
+                    writer = csv.writer(f)
+                    for row in table:
+                        writer.writerow(row)
+                
+                # FIX: choose one of the variants below
+                
+                # VARIANT 1: resolve (for a path relative to the CWD)
+                page_tables.append({
+                    "table_idx": table_idx,
+                    "csv_file": str(csv_file.resolve().relative_to(Path.cwd().resolve())),
+                    "rows": len(table),
+                    "cols": len(table[0]) if table else 0,
+                })
+                
+                # OR VARIANT 2: keep the relative path (recommended)
+                # page_tables.append({
+                #     "table_idx": table_idx,
+                #     "csv_file": str(csv_file),
+                #     "rows": len(table),
+                #     "cols": len(table[0]) if table else 0,
+                # })
+                
+                # OR VARIANT 3: os.path.relpath
+                # page_tables.append({
+                #     "table_idx": table_idx,
+                #     "csv_file": os.path.relpath(csv_file, start=Path.cwd()),
+                #     "rows": len(table),
+                #     "cols": len(table[0]) if table else 0,
+                # })
+                
+                tables_meta.append({
+                    "page": page_num,
+                    "table_file": str(csv_file),  # Või samuti parandada
+                })
+            
+            # Page-level JSON
+            pages_data.append({
+                "page": page_num,
+                "text": text,
+                "words_count": len(words),
+                "tables": page_tables,
+                "width": page.width,
+                "height": page.height,
+            })
+            
+            print(f"  ✓ Leht {page_num}: {len(text)} tähte, {len(tables)} tabel(it)")
+    
+    # Save the pages as JSON
+    json_file = output_subdir / "pages.json"
+    json_file.write_text(json.dumps(pages_data, ensure_ascii=False, indent=2))
+    
+    tables_json = output_subdir / "tables_meta.json"
+    tables_json.write_text(json.dumps(tables_meta, ensure_ascii=False, indent=2))
+    
+    return {
+        "filename": pdf_path.name,
+        "pages": len(pdf.pages),
+        "json_path": str(json_file),
+        "tables_count": table_count,
+        "output_dir": str(output_subdir),
+    }
+
+def extract_all_pdfs():
+    """Teisendab kõik PDF-d data/raw_pdfs-st"""
+    pdf_files = list(PDF_INPUT_DIR.glob("*.pdf"))
+
+    if not pdf_files:
+        print("❌ PDF-e ei leitud!", PDF_INPUT_DIR)
+        return
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    skipped = 0
+    processed = 0
+
+    for pdf_path in pdf_files:
+        filename = pdf_path.name
+        file_hash = get_file_hash(pdf_path)
+        file_size = pdf_path.stat().st_size
+
+        # Check for duplicates (filename + hash)
+        cur.execute(
+            "SELECT id FROM raw_documents WHERE filename = %s AND file_hash = %s",
+            (filename, file_hash)
+        )
+        existing = cur.fetchone()
+
+        if existing:
+            print(f"⏭️  {filename} (hash: {file_hash[:8]}...) on juba olemas. Vahele jäetud.")
+            skipped += 1
+            continue
+
+        # Extract
+        result = extract_single_pdf(pdf_path)
+
+        # Save to the database
+        cur.execute("""
+        INSERT INTO raw_documents 
+        (filename, file_hash, pages, file_size_bytes, extracted_json_path, extracted_tables_dir, extracted_at)
+        VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)
+        RETURNING id
+        """, (
+            filename,
+            file_hash,
+            result["pages"],
+            file_size,
+            result["json_path"],
+            result["output_dir"],
+        ))
+        doc_id = cur.fetchone()[0]
+        conn.commit()
+
+        print(f"✓ {filename} salvestatud (DB ID: {doc_id}, hash: {file_hash[:8]}...)")
+        processed += 1
+
+    cur.close()
+    conn.close()
+
+    print(f"\n✓ Kokkuvõte: {processed} uus, {skipped} juba olemas")
+
+if __name__ == "__main__":
+    extract_all_pdfs()

+ 43 - 0
src/find_duplicates.py

@@ -0,0 +1,43 @@
+# file: src/find_duplicates.py
+
+import os
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+def find_duplicates():
+    conn = psycopg2.connect(
+        host=os.getenv("DB_HOST", "localhost"),
+        port=os.getenv("DB_PORT", "5432"),
+        database=os.getenv("DB_NAME", "pdf_research"),
+        user=os.getenv("DB_USER", "pdf_user"),
+        password=os.getenv("DB_PASSWORD"),
+    )
+    cur = conn.cursor()
+
+    # Find files extracted more than once (same name + hash)
+    cur.execute("""
+    SELECT filename, file_hash, COUNT(*) as count, array_agg(id) as ids
+    FROM raw_documents
+    WHERE file_hash IS NOT NULL
+    GROUP BY filename, file_hash
+    HAVING COUNT(*) > 1
+    """)
+    duplicates = cur.fetchall()
+
+    if duplicates:
+        print("⚠️  LEITUD DUPLIKAADID:")
+        for filename, file_hash, count, ids in duplicates:
+            print(f"  - {filename} ({count}x)")
+            print(f"    Hash: {file_hash}")
+            print(f"    DB IDs: {ids}")
+            print()
+    else:
+        print("✓ Duplikaate ei leitud!")
+
+    cur.close()
+    conn.close()
+
+if __name__ == "__main__":
+    find_duplicates()

+ 212 - 0
src/generate_answer.py

@@ -0,0 +1,212 @@
+# file: src/generate_answer.py
+import os
+import json
+import requests
+from typing import List, Dict
+from dotenv import load_dotenv
+
+load_dotenv()
+
+LLAMA_CPP_URL = os.getenv("LLAMA_CPP_URL", "http://localhost:8070/completion")
+
+def deduplicate_chunks(chunks: List[Dict]) -> List[Dict]:
+    """Eemaldab duplikaadi chunk'id."""
+    seen = set()
+    unique_chunks = []
+    
+    for chunk in chunks:
+        key = (chunk['filename'], chunk['page'], chunk['text'][:50])
+        if key not in seen:
+            seen.add(key)
+            unique_chunks.append(chunk)
+    
+    return unique_chunks
+
+def build_context(articles: List[Dict], chunks: List[Dict], max_chars: int = 8000) -> str:
+    """
+    Suurendatud kontekst. Weaviate andmete osa oluliselt suurem.
+    """
+    context_parts = []
+    char_count = 0
+    
+    unique_chunks = deduplicate_chunks(chunks)
+    
+    # ENLARGED: Weaviate article metadata
+    context_parts.append("=== ARTICLE SUMMARIES FROM WEAVIATE ===\n")
+    for i, art in enumerate(articles[:8], 1):  # 5 → 8 articles
+        
+        # Build rich article info
+        authors_str = ', '.join(art['authors'][:3]) if art['authors'] else 'N/A'
+        keywords_str = ', '.join(art['key_concepts'][:6]) if art['key_concepts'] else 'N/A'
+        methods_str = ', '.join(art['methods_used'][:4]) if art.get('methods_used') else 'N/A'
+        
+        # Parse transport_context (a JSON string); json is imported at module level
+        transport_str = "N/A"
+        try:
+            if art.get('transport_context'):
+                tc = json.loads(art['transport_context'])
+                theoretical = tc.get('theoretical_contribution', '')
+                practical = tc.get('practical_applicability', '')
+                transport_str = f"Theoretical: {theoretical[:200]}... Practical: {practical[:200]}..."
+        except Exception:
+            transport_str = art.get('transport_context', '')[:300]
+        
+        article_text = f"""[Article {i}] {art['title']}
+Authors: {authors_str}
+Year: {art.get('year', 'N/A')}
+Keywords: {keywords_str}
+Methods Used: {methods_str}
+Transport Context: {transport_str}
+Summary (Estonian): {art['summary_et'][:900]}
+
+"""
+        context_parts.append(article_text)
+        char_count += len(article_text)
+        
+        if char_count > max_chars * 0.45:  # 45% artiklitele
+            break
+    
+    # pgvector excerpts (same as before)
+    context_parts.append("\n=== TEXT EXCERPTS FROM SOURCES ===\n")
+    for i, chunk in enumerate(unique_chunks[:15], 1):
+        chunk_text = f"""[Excerpt {i}] ({chunk['filename']}, page {chunk['page']})
+{chunk['text'][:280]}
+
+"""
+        context_parts.append(chunk_text)
+        char_count += len(chunk_text)
+        
+        if char_count > max_chars:
+            break
+    
+    return "\n".join(context_parts)
+
+def generate_answer(query: str, context: str, temperature: float = 0.6, max_tokens: int = 1500) -> str:
+    """
+    Genereerib vastuse. Sama kui enne, aga kontekst on rikkam.
+    """
+    
+    prompt = f"""You are an expert in scientific research on transportation safety and traffic engineering.
+
+CONTEXT (from scientific sources in Weaviate database):
+{context}
+
+USER QUESTION (in Estonian):
+{query}
+
+INSTRUCTIONS:
+- Answer in fluent Estonian language
+- Use data and findings from the sources provided above
+- Cite articles or excerpts by number in square brackets (e.g., "[Article 1]" or "[Excerpt 3]")
+- Be concrete, detailed, and factual
+- Write at least 250 words (important: use the context fully)
+- Do NOT speculate or add information not present in the context
+- Reference the methods, findings, and practical applications mentioned in the articles
+- Structure your answer clearly with paragraphs
+
+ANSWER (in Estonian):"""
+
+    payload = {
+        "prompt": prompt,
+        "n_predict": max_tokens,
+        "temperature": temperature,
+        "top_p": 0.95,
+        "repeat_penalty": 1.2,
+        "presence_penalty": 0.1,
+        "frequency_penalty": 0.1,
+    }
+
+    print(f"📤 Saandan päring llama.cpp'le...")
+    print(f"   📊 Konteksti suurus: {len(context)} märki ≈ {len(context)//4} tokenit")
+    
+    try:
+        response = requests.post(LLAMA_CPP_URL, json=payload, timeout=240)
+        response.raise_for_status()
+        result = response.json()
+        
+        answer = result.get("content", "").strip()
+        tokens_predicted = result.get("tokens_predicted", 0)
+        
+        print(f"   ✅ Genereeriud: {tokens_predicted} tokenit, {len(answer.split())} sõna")
+        
+        if not answer:
+            return "⚠️ LLM tagastas tühja vastuse."
+        
+        word_count = len(answer.split())
+        if word_count < 100:
+            print(f"   ⚠️  Liiga lühike ({word_count} sõna) – konteksti ei piisa")
+            return f"⚠️ LLM vastus on liiga lühike ({word_count} sõna):\n\n{answer}"
+        
+        return answer
+        
+    except Exception as e:
+        return f"❌ Viga: {e}"
+
+def rag_query(query: str, articles: List[Dict], chunks: List[Dict], 
+              temperature: float = 0.6, max_tokens: int = 1500) -> Dict:
+    """RAG tsükkel."""
+    print(f"\n{'='*70}")
+    print(f"🔍 RAG PÄRING")
+    print(f"{'='*70}")
+    print(f"Küsimus: {query}")
+    print(f"Artikleid: {len(articles)}, Chunk'e: {len(chunks)}")
+    
+    context = build_context(articles, chunks, max_chars=8000)  # 5000 → 8000
+    answer = generate_answer(query, context, temperature=temperature, max_tokens=max_tokens)
+    
+    # DEBUG: save the context for inspection
+    with open("/tmp/rag_context_debug.txt", "w") as f:
+        f.write(context)
+    
+    return {
+        "query": query,
+        "answer": answer,
+        "sources": {
+            "articles": [
+                {
+                    "title": a["title"], 
+                    "authors": a.get("authors", [])[:3],
+                    "score": a.get("score", 0)
+                } 
+                for a in articles[:8]  # 5 → 8
+            ],
+            "chunks_used": len(set((c['filename'], c['page']) for c in chunks[:15]))
+        },
+        "parameters": {
+            "context_chars": len(context),
+            "context_tokens_approx": len(context) // 4,
+            "context_weaviate_chars": len("\n".join([a['summary_et'][:900] for a in articles[:8]])),
+        }
+    }
+
+if __name__ == "__main__":
+    from src.query_hybrid import hybrid_search
+    
+    query = "young driver accident risk"
+    query = "traffic flow"
+    query = "road safety"
+    print(f"\n🚀 Käivitan hübriidotsingu...")
+    
+    results = hybrid_search(query, top_articles=10, top_chunks=20)
+    
+    print(f"\n🤖 Genereerin RAG vastust...")
+    rag_result = rag_query(
+        query=results["query"],
+        articles=results["articles"],
+        chunks=results["chunks"],
+        temperature=0.6,
+        max_tokens=1500
+    )
+    
+    print("\n" + "=" * 70)
+    print("🎯 RAG VASTUS:")
+    print("=" * 70)
+    print(rag_result["answer"])
+    
+    print("\n" + "=" * 70)
+    print("📊 STATISTIKA:")
+    print("=" * 70)
+    print(f"Weaviate konteksti: {rag_result['parameters']['context_weaviate_chars']} märki")
+    print(f"Kokku konteksti: {rag_result['parameters']['context_chars']} märki")
+    print(f"Kokku tokenit: ~{rag_result['parameters']['context_tokens_approx']}")

+ 214 - 0
src/query_hybrid.py

@@ -0,0 +1,214 @@
+# file: src/query_hybrid.py
+import os
+from dotenv import load_dotenv
+import psycopg2
+from weaviate import WeaviateClient
+from weaviate.connect import ConnectionParams
+from src.embed_utils import get_embedding
+import json
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+WEAVIATE_HOST = os.getenv("WEAVIATE_HOST", "localhost")
+WEAVIATE_HTTP_PORT = int(os.getenv("WEAVIATE_HTTP_PORT", "8080"))
+WEAVIATE_CLASS = os.getenv("WEAVIATE_CLASS", "ScientificArticle")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def get_weaviate_client() -> WeaviateClient:
+    """Weaviate v4 klient - HTTP ainult"""
+    client = WeaviateClient(connection_params=ConnectionParams.from_params(
+        http_host=WEAVIATE_HOST,
+        http_port=WEAVIATE_HTTP_PORT,
+        http_secure=False,
+        grpc_host=WEAVIATE_HOST,
+        grpc_port=50051,
+        grpc_secure=False,
+    ))
+    client.connect()
+    return client
+
+def search_weaviate_articles_all(limit: int = 10000) -> list[dict]:
+    """
+    Loeb KÕIK artiklid Weaviate'st (fetch_objects),
+    ei kasuta near_text (mis vajavad Ollama embedding'uid).
+    """
+    client = get_weaviate_client()
+
+    try:
+        collection = client.collections.get(WEAVIATE_CLASS)
+        
+        # Fetch all objects without a search
+        results = collection.query.fetch_objects(
+            limit=limit,
+            return_properties=["title", "source_file", "summary_et", "key_concepts", 
+                             "authors", "transport_context", "relevance_score", "abstract_en"]
+        )
+
+        articles = []
+        for obj in results.objects:
+            props = obj.properties
+            articles.append({
+                "title": props.get("title", "N/A"),
+                "article_id": str(obj.uuid),
+                "summary_et": (props.get("summary_et", "") or "")[:500],
+                "key_concepts": props.get("key_concepts", []),
+                "authors": props.get("authors", []),
+                "transport_context": props.get("transport_context", ""),
+                "relevance_score": props.get("relevance_score", 0),
+                "abstract_en": (props.get("abstract_en", "") or "")[:500],
+                "source_file": props.get("source_file", ""),
+            })
+
+        return articles
+
+    except Exception as e:
+        print(f"❌ Weaviate päringu viga: {e}")
+        import traceback
+        traceback.print_exc()
+        return []
+    finally:
+        client.close()
+
+def search_articles_by_query(query: str, all_articles: list[dict], limit: int = 10) -> list[dict]:
+    """
+    Otsib artikleid query abil, kasutades lokaalseid embeddings'eid (bge-small).
+    Arvutab sarnasuse vektoritega.
+    """
+    query_emb = get_embedding(query)
+    
+    # Search over the articles' summary_et and abstract_en
+    scored_articles = []
+    
+    for article in all_articles:
+        # Take the text (summary_et or abstract_en)
+        text = article.get("summary_et") or article.get("abstract_en") or ""
+        
+        if not text:
+            continue
+        
+        # Compute the embedding
+        text_emb = get_embedding(text)
+        
+        # Cosine similarity (dot product of normalized vectors)
+        similarity = sum(a * b for a, b in zip(query_emb, text_emb))
+        
+        scored_articles.append({
+            **article,
+            "score": similarity
+        })
+    
+    # Sort and take the top N
+    scored_articles.sort(key=lambda x: x["score"], reverse=True)
+    return scored_articles[:limit]
+
+def search_pgvector_chunks(query: str, article_ids: list[str], limit: int = 20) -> list[dict]:
+    """
+    Otsib pgvectorist chunkid, mis kuuluvad antud artiklitele.
+    """
+    if not article_ids:
+        return []
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    query_emb = get_embedding(query)
+
+    cur.execute("""
+        SELECT 
+            c.id,
+            c.text,
+            c.page,
+            c.chunk_index,
+            r.filename,
+            r.weaviate_article_id,
+            1 - (c.embedding <=> %s::vector) AS similarity
+        FROM chunks c
+        JOIN raw_documents r ON c.raw_doc_id = r.id
+        WHERE r.weaviate_article_id = ANY(%s::UUID[])
+        ORDER BY c.embedding <=> %s::vector
+        LIMIT %s
+    """, (query_emb, article_ids, query_emb, limit))
+
+    chunks = []
+    for row in cur.fetchall():
+        chunks.append({
+            "chunk_id": row[0],
+            "text": row[1],
+            "page": row[2],
+            "chunk_index": row[3],
+            "filename": row[4],
+            "weaviate_article_id": str(row[5]) if row[5] else None,
+            "similarity": float(row[6]),
+        })
+
+    cur.close()
+    conn.close()
+
+    return chunks
+
+def hybrid_search(query: str, top_articles: int = 10, top_chunks: int = 20) -> dict:
+    """
+    Kaheastmeline otsing:
+    1. Leia top artikleid Weaviate'st (lokaalsete embeddings'idega)
+    2. Leia nende artiklite chunkide seast parimad pgvectorist
+    """
+    print(f"\n🔍 Hübriidotsing: '{query}'\n")
+
+    # Step 1: Read all articles from Weaviate
+    print("Step 1: Reading articles from Weaviate...")
+    all_articles = search_weaviate_articles_all(limit=10000)
+    print(f"✓ Found {len(all_articles)} articles in total")
+
+    # Step 2: Search with the query (local embeddings)
+    print(f"Step 2: Searching (among {len(all_articles)} articles)...")
+    articles = search_articles_by_query(query, all_articles, limit=top_articles)
+    print(f"✓ Found {len(articles)} relevant articles")
+
+    if articles:
+        for i, art in enumerate(articles, 1):
+            print(f"  {i}. {art['title'][:60]}...")
+            print(f"     Relevants: {art['relevance_score']:.1f} | Score: {art['score']:.3f}")
+
+    # Step 3: pgvector chunk search
+    print("\nStep 3: Searching pgvector (chunk level)...")
+    article_ids = [a["article_id"] for a in articles if a["article_id"]]
+    
+    if article_ids:
+        chunks = search_pgvector_chunks(query, article_ids, limit=top_chunks)
+        print(f"✓ Leitud {len(chunks)} chunk'i")
+
+        if chunks:
+            for i, chunk in enumerate(chunks[:5], 1):
+                print(f"  {i}. {chunk['filename']} (leht {chunk['page']}, sim: {chunk['similarity']:.3f})")
+                print(f"     \"{chunk['text'][:100]}...\"")
+    else:
+        chunks = []
+        print(f"⚠️ Ühtegi artiklit ei leitud, chunkide otsingut ei saa teha")
+
+    return {
+        "query": query,
+        "articles": articles,
+        "chunks": chunks,
+    }
+
+if __name__ == "__main__":
+    #results = hybrid_search("young driver accident risk", top_articles=10, top_chunks=20)
+    results = hybrid_search("modelling traffic volume in rural areas", top_articles=10, top_chunks=20)
+
+    print("\n" + "=" * 60)
+    print("TÄIELIK TULEMUS:")
+    print(json.dumps(results, default=str, indent=2, ensure_ascii=False)[:3000])

+ 276 - 0
src/rag_api.py

@@ -0,0 +1,276 @@
+# file: src/rag_api.py
+"""
+RAG API - FastAPI server for scientific-article search and LLM answers.
+Variant A: Weaviate (metadata) + pgvector (chunks) + unsloth LLM (answers)
+"""
+
+import logging
+from typing import List
+from datetime import datetime
+
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+
+from dotenv import load_dotenv
+
+from src.query_hybrid import hybrid_search
+from src.generate_answer import rag_query
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+# FastAPI app
+app = FastAPI(
+    title="RAG API - Transportation Safety Research",
+    description="Hübriidotsing teadusartiklites + LLM vastuste genereerimine",
+    version="1.0.0"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Pydantic models
+class RAGRequest(BaseModel):
+    """Input for a RAG query (Field, not Query, is the idiomatic choice for body models)"""
+    query: str = Field(..., min_length=5, max_length=500, description="Question in Estonian")
+    top_articles: int = Field(10, ge=1, le=20, description="How many articles to retrieve (1-20)")
+    top_chunks: int = Field(20, ge=1, le=50, description="How many chunks to retrieve (1-50)")
+    temperature: float = Field(0.6, ge=0.1, le=1.0, description="LLM temperature (0.1-1.0)")
+    max_tokens: int = Field(1500, ge=500, le=3000, description="Max tokens for the answer (500-3000)")
+
+class Article(BaseModel):
+    """Artikli metaandmed"""
+    title: str
+    article_id: str
+    authors: List[str]
+    relevance_score: float
+    score: float
+
+class Chunk(BaseModel):
+    """Tekstilõik"""
+    chunk_id: int
+    filename: str
+    page: int
+    text: str
+    similarity: float
+
+class RAGResponse(BaseModel):
+    """RAG vastuse struktuur"""
+    query: str
+    answer: str
+    articles: List[Article]
+    chunks: List[Chunk]
+    parameters: dict
+    timestamp: str
+    success: bool
+
+# API endpoints
+
+@app.get("/", tags=["Info"])
+async def root():
+    """RAG API juurepunkt"""
+    return {
+        "name": "RAG API - Transportation Safety",
+        "version": "1.0.0",
+        "endpoints": {
+            "search": "/search",
+            "health": "/health",
+            "docs": "/docs"
+        }
+    }
+
+@app.get("/health", tags=["Info"])
+async def health():
+    """Süsteemi tervisekontroll"""
+    return {
+        "status": "healthy",
+        "timestamp": datetime.now().isoformat(),
+        "version": "1.0.0"
+    }
+
+@app.post("/search", response_model=RAGResponse, tags=["RAG"])
+async def search_rag(request: RAGRequest) -> RAGResponse:
+    """
+    RAG otsing: hübriidotsing + LLM vastuse genereerimine
+    
+    **Protsess:**
+    1. Hübriidotsing (Weaviate + pgvector)
+    2. LLM vastuse genereerimine kontekstiga
+    3. Vastuse tagastamine JSON-na
+    
+    **Näide:**
+    ```json
+    {
+        "query": "young driver accident risk",
+        "top_articles": 10,
+        "top_chunks": 20,
+        "temperature": 0.6,
+        "max_tokens": 1500
+    }
+    ```
+    """
+    
+    try:
+        logger.info(f"🔍 RAG päring: '{request.query}'")
+        
+        # Step 1: hybrid search
+        logger.info("📚 Hybrid search (Weaviate + pgvector)...")
+        hybrid_results = hybrid_search(
+            query=request.query,
+            top_articles=request.top_articles,
+            top_chunks=request.top_chunks
+        )
+        
+        # Step 2: LLM answer generation
+        logger.info("🤖 Generating LLM answer...")
+        rag_results = rag_query(
+            query=hybrid_results["query"],
+            articles=hybrid_results["articles"],
+            chunks=hybrid_results["chunks"],
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
+        )
+        
+        # Step 3: format the response
+        logger.info("✅ Answer ready")
+        
+        # Convert articles and chunks into response models
+        articles = [
+            Article(
+                title=a["title"],
+                article_id=a["article_id"],
+                authors=a.get("authors", [])[:3],
+                relevance_score=a.get("relevance_score", 0),
+                score=a.get("score", 0)
+            )
+            for a in hybrid_results["articles"][:8]
+        ]
+        
+        chunks = [
+            Chunk(
+                chunk_id=c["chunk_id"],
+                filename=c["filename"],
+                page=c["page"],
+                text=c["text"][:500],  # Lühendatud tekst JSON jaoks
+                similarity=c["similarity"]
+            )
+            for c in hybrid_results["chunks"][:10]
+        ]
+        
+        response = RAGResponse(
+            query=request.query,
+            answer=rag_results["answer"],
+            articles=articles,
+            chunks=chunks,
+            parameters=rag_results["parameters"],
+            timestamp=datetime.now().isoformat(),
+            success=True
+        )
+        
+        return response
+        
+    except Exception as e:
+        logger.error(f"❌ Viga: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail={
+                "error": str(e),
+                "query": request.query,
+                "timestamp": datetime.now().isoformat()
+            }
+        )
+
+@app.get("/search-simple", tags=["RAG"])
+async def search_simple(
+    q: str = Query(..., min_length=5, max_length=500, description="Question")
+):
+    """
+    Lihtsustatud RAG otsing (vaikimisi parameetritega)
+    
+    **Näide:** `/search-simple?q=young%20driver%20risk`
+    """
+    
+    try:
+        request = RAGRequest(
+            query=q,
+            top_articles=10,
+            top_chunks=20,
+            temperature=0.6,
+            max_tokens=1500
+        )
+        
+        return await search_rag(request)
+        
+    except Exception as e:
+        raise HTTPException(
+            status_code=400,
+            detail=str(e)
+        )
+
+@app.get("/articles", tags=["Info"])
+async def list_articles(
+    limit: int = Query(10, ge=1, le=100),
+    skip: int = Query(0, ge=0)
+):
+    """
+    Artiklite loend (metadata)
+    """
+    
+    try:
+        from src.query_hybrid import search_weaviate_articles_all
+        
+        articles = search_weaviate_articles_all(limit=limit + skip)
+        articles = articles[skip:skip + limit]
+        
+        return {
+            "total": len(articles),
+            "articles": [
+                {
+                    "title": a["title"],
+                    "authors": a.get("authors", [])[:3],
+                    "relevance_score": a.get("relevance_score", 0),
+                    "year": a.get("year", "N/A")
+                }
+                for a in articles
+            ]
+        }
+        
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Error handlers
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request, exc):
+    """Custom HTTP exception handler"""
+    return JSONResponse(
+        status_code=exc.status_code,
+        content={
+            "error": exc.detail,
+            "timestamp": datetime.now().isoformat()
+        }
+    )
+
+if __name__ == "__main__":
+    import uvicorn
+    
+    # Start the FastAPI server
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8071,
+        log_level="info"
+    )

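Once the server is up, the read-only endpoints (`/health`, `/articles`) can be exercised with a short script; a sketch assuming the default port 8071 from the `uvicorn.run` call above:

```python
import requests

BASE = "http://localhost:8071"  # port from the uvicorn.run call above

# Health check
print(requests.get(f"{BASE}/health").json())

# Paginated article listing (limit/skip as declared on /articles)
resp = requests.get(f"{BASE}/articles", params={"limit": 5, "skip": 0})
resp.raise_for_status()
for art in resp.json()["articles"]:
    print(art["title"], "-", art.get("year", "N/A"))
```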
+ 133 - 0
src/sync_weaviate.py

@@ -0,0 +1,133 @@
+# file: src/sync_weaviate.py
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+import psycopg2
+from weaviate import WeaviateClient
+from weaviate.connect import ConnectionParams
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+# Weaviate - use the actual host address
+WEAVIATE_HOST = os.getenv("WEAVIATE_HOST", "localhost")
+WEAVIATE_HTTP_PORT = int(os.getenv("WEAVIATE_HTTP_PORT", "8080"))
+WEAVIATE_CLASS = os.getenv("WEAVIATE_CLASS", "ScientificArticle")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def get_weaviate_client() -> WeaviateClient:
+    """Weaviate v4 klient - HTTP ainult"""
+    client = WeaviateClient(connection_params=ConnectionParams.from_params(
+        http_host=WEAVIATE_HOST,
+        http_port=WEAVIATE_HTTP_PORT,
+        http_secure=False,
+        grpc_host=WEAVIATE_HOST,
+        grpc_port=50051,  # default gRPC port; the v4 client runs collection queries over gRPC, so it should be reachable
+        grpc_secure=False,
+    ))
+    client.connect()
+    print(f"✓ Ühendatud Weaviate'ga: {WEAVIATE_HOST}:{WEAVIATE_HTTP_PORT}")
+    return client
+
+def sync_weaviate_to_postgres():
+    """
+    Loeb Weaviate'st kõik ScientificArticle objektid,
+    otsib vastavad raw_documents read (source_file järgi),
+    ja uuendab weaviate_article_id veeru.
+    """
+    print("Connecting to Weaviate...")
+    client = get_weaviate_client()
+    
+    try:
+        is_ready = client.is_ready()
+        print(f"✓ Weaviate on valmis: {is_ready}")
+    except Exception as e:
+        print(f"❌ Weaviate ühenduse viga: {e}")
+        client.close()
+        return
+
+    print(f"Fetching articles from Weaviate (klass: {WEAVIATE_CLASS})...")
+    
+    try:
+        # Weaviate v4: fetch_objects without a filter
+        collection = client.collections.get(WEAVIATE_CLASS)
+        
+        # Fetch all objects - use a large limit
+        results = collection.query.fetch_objects(
+            limit=10000,
+            return_properties=["source_file", "title"]
+        )
+        
+        print(f"✓ Leitud {len(results.objects)} artiklit Weaviate'st")
+    except Exception as e:
+        print(f"❌ Weaviate päringu viga: {e}")
+        import traceback
+        traceback.print_exc()
+        client.close()
+        return
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    synced = 0
+    not_found = 0
+    errors = 0
+
+    for obj in results.objects:
+        try:
+            props = obj.properties
+            source_file = props.get("source_file")
+            title = props.get("title", "N/A")
+            weaviate_id = obj.uuid
+
+            if not source_file or not weaviate_id:
+                print(f"⚠️ Puudub source_file või id: {title}")
+                not_found += 1
+                continue
+
+            weaviate_uuid = str(weaviate_id)
+            filename = Path(source_file).name
+
+            cur.execute(
+                "SELECT id FROM raw_documents WHERE filename = %s",
+                (filename,)
+            )
+            row = cur.fetchone()
+
+            if row:
+                raw_doc_id = row[0]
+                cur.execute(
+                    "UPDATE raw_documents SET weaviate_article_id = %s WHERE id = %s",
+                    (weaviate_uuid, raw_doc_id)
+                )
+                synced += 1
+                print(f"✓ {title[:50]}... (pg_id={raw_doc_id})")
+            else:
+                not_found += 1
+                print(f"⚠️ Ei leitud pgvector'ist: {filename}")
+        except Exception as e:
+            errors += 1
+            print(f"❌ Viga objektiga: {e}")
+
+    conn.commit()
+    cur.close()
+    conn.close()
+    client.close()
+
+    print(f"\n✓ Kokkuvõte: {synced} sünkroniseeritud, {not_found} ei leitud, {errors} viga")
+
+if __name__ == "__main__":
+    sync_weaviate_to_postgres()
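After a sync run, the linkage can be verified straight from Postgres. A minimal sketch reusing the script's connection settings; it assumes only the raw_documents.weaviate_article_id column that the UPDATE above writes:

```python
# Count how many raw_documents rows got linked to a Weaviate article.
import os
import psycopg2
from dotenv import load_dotenv

load_dotenv()

conn = psycopg2.connect(
    host=os.getenv("DB_HOST", "localhost"),
    port=os.getenv("DB_PORT", "5432"),
    database=os.getenv("DB_NAME", "pdf_research"),
    user=os.getenv("DB_USER", "pdf_user"),
    password=os.getenv("DB_PASSWORD"),
)
with conn.cursor() as cur:
    cur.execute("""
        SELECT COUNT(*) FILTER (WHERE weaviate_article_id IS NOT NULL),
               COUNT(*)
        FROM raw_documents
    """)
    linked, total = cur.fetchone()
    print(f"{linked}/{total} raw_documents linked to Weaviate articles")
conn.close()
```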