
Initial commit

ardo committed 3 months ago
Parent commit d4c68f1e68

+ 8 - 0
.gitignore

@@ -58,3 +58,11 @@ docs/_build/
 # PyBuilder
 target/
 
+# Ardo
+testimiseks/
+data/chunks
+data/extracted
+data/processed
+data/raw_pdfs
+archive/
+output/

+ 87 - 0
LOEMIND.md

@@ -0,0 +1,87 @@
+
+
+# Activate the venv
+source ~/venvs/pdf-env/bin/activate
+cd ~/rag-demo/pdf-pipeline
+export $(grep -v '^#' .env | xargs -d '\n')
+
+
+python -m src.db_schema  # re-run with the new schema
+# Change the vector column
+./scripts/db_vector_välja_muutmine.sh
+
+python -m src.extract_pdf  # with the fixed extraction
+python -m src.check_status  # with the improved overview
+
+python -m src.find_duplicates  # duplicate check
+python -m src.clean_and_normalize
+
+python -m src.create_chunks
+python -m src.embed_chunks
+
+# Testing
+docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "SELECT COUNT(*) FROM processed_documents;"
+docker exec -it postgres_postgis psql -U osm -d pdf_research -h localhost -c "
+  SELECT id, raw_doc_id, page, LENGTH(content_text) AS len, has_table
+  FROM processed_documents
+  ORDER BY id
+  LIMIT 20;
+"
+
+# Sync with the Weaviate database
+python -m src.sync_weaviate
+
+python -m src.query_hybrid
+python -m src.generate_answer
+
+# Start the API
+python -m src.rag_api
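+# Alternative (a sketch; assumes the FastAPI app object is src.rag_api:app
+# and the port 8071 used in the examples below):
+# uvicorn src.rag_api:app --host 0.0.0.0 --port 8071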
+
+## ==============================================================
+## API request examples
+### 1. **Interactive Docs** (Swagger UI)
+Open in a browser:
+```
+http://localhost:8071/docs
+```
+### 2. **With cURL**
+```bash
+# Simplified search
+curl "http://localhost:8071/search-simple?q=young%20driver%20risk"
+# POST request (full control)
+curl -X POST "http://localhost:8071/search" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "query": "young driver accident risk",
+    "top_articles": 10,
+    "top_chunks": 20,
+    "temperature": 0.6,
+    "max_tokens": 1500
+  }' | jq '.'
+```
+### 3. **With Python requests**
+
+```python
+import requests
+import json
+
+# POST request
+response = requests.post(
+    "http://localhost:8071/search",
+    json={
+        "query": "traffic flow prediction",
+        "top_articles": 10,
+        "top_chunks": 20,
+        "temperature": 0.6,
+        "max_tokens": 1500
+    }
+)
+
+result = response.json()
+print("Küsimus:", result["query"])
+print("Vastus:", result["answer"])
+print("Allikad:", result["articles"])
+```
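+
+Before issuing full RAG queries, it can help to hit the health endpoint first
+(a minimal sketch; assumes the API listens on port 8071 as above):
+
+```python
+import requests
+
+# GET /health returns a small JSON status document
+print(requests.get("http://localhost:8071/health").json())
+```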
+
+
+

+ 13 - 0
config/sql/kasutaja_ja_skeemi_loomine.sql

@@ -0,0 +1,13 @@
+-- 1. Create the admin user 'osm' with password 'osm'
+-- NB! SUPERUSER grants full privileges; narrow them down if needed
+CREATE ROLE osm
+  WITH
+    LOGIN
+    SUPERUSER
+    CREATEDB
+    CREATEROLE
+    PASSWORD 'osm';
+
+-- 2. Create the pdf_research database
+CREATE DATABASE pdf_research
+    WITH OWNER = osm;
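+
+-- To apply this script (a sketch; assumes a local 'postgres' superuser):
+--   psql -U postgres -h localhost -f config/sql/kasutaja_ja_skeemi_loomine.sql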

+ 130 - 0
migrate_weaviate.py

@@ -0,0 +1,130 @@
+#!/usr/bin/env python3
+"""
+Weaviate collection migration (remote -> local)
+"""
+import weaviate
+
+# Connections
+REMOTE_WEAVIATE_URL = "http://hetzner:9020"
+LOCAL_WEAVIATE_URL = "http://localhost:8080"
+COLLECTION_NAME = "ScientificArticles"
+BATCH_SIZE = 100
+
+def connect_weaviate(url: str):
+    """Ühenda Weaviate'ga"""
+    client = weaviate.Client(url)
+    if not client.is_ready():
+        raise ConnectionError(f"Weaviate pole käimas: {url}")
+    return client
+
+def get_schema(client, collection_name: str):
+    """Tõmba kollektsiooni schema"""
+    schema = client.schema.get()
+    for cls in schema.get("classes", []):
+        if cls["class"] == collection_name:
+            return cls
+    raise ValueError(f"Kollektsiooni {collection_name} pole olemas")
+
+def export_collection(client, collection_name: str):
+    """Ekspordi kõik objektid Remote Weaviate'st"""
+    print(f"📤 Eksporditakse {collection_name} remote'st ({REMOTE_WEAVIATE_URL})...")
+    
+    where_filter = {
+        "path": ["__typename"],
+        "operator": "NotEqual",
+        "valueString": ""
+    }
+    
+    objects = []
+    after = None
+    
+    while True:
+        response = client.data_object.get(
+            collection_name=collection_name,
+            limit=BATCH_SIZE,
+            after=after,
+            where=where_filter
+        )
+        
+        batch = response.get("objects", [])
+        if not batch:
+            break
+        
+        objects.extend(batch)
+        print(f"  ✓ Eksportitud {len(objects)} objekti...")
+        
+        # Jätka järgmiselt batshi
+        if len(batch) < BATCH_SIZE:
+            break
+        after = batch[-1]["id"]
+    
+    print(f"✅ Kokku eksportitud: {len(objects)} objekti")
+    return objects
+
+def import_collection(client, collection_name: str, objects: list):
+    """Importi objektid lokaalse Weaviate'sse"""
+    print(f"📥 Importitakse {collection_name} lokaalse Weaviate'sse...")
+    
+    success_count = 0
+    error_count = 0
+    
+    for i, obj in enumerate(objects, 1):
+        try:
+            # Eemalda ID, et Weaviate saaks luua uue
+            obj_data = obj.copy()
+            obj_id = obj_data.pop("id", None)
+            
+            client.data_object.create(
+                data_object=obj_data.get("properties", {}),
+                class_name=collection_name,
+                uuid=obj_id  # Kasuta sama ID
+            )
+            success_count += 1
+            
+            if i % 50 == 0:
+                print(f"  ✓ Importitud {i}/{len(objects)} objekti...")
+        except Exception as e:
+            error_count += 1
+            print(f"  ❌ Objekt {obj.get('id')}: {str(e)}")
+    
+    print(f"✅ Importimine valmis: {success_count} edu, {error_count} viga")
+    return success_count
+
+def main():
+    try:
+        # Connect
+        print("🔗 Connecting to the remote Weaviate...")
+        remote_client = connect_weaviate(REMOTE_WEAVIATE_URL)
+
+        print("🔗 Connecting to the local Weaviate...")
+        local_client = connect_weaviate(LOCAL_WEAVIATE_URL)
+
+        # Check the collection
+        print(f"📋 Checking collection {COLLECTION_NAME}...")
+        schema = get_schema(remote_client, COLLECTION_NAME)
+        print(f"  ✓ Found: {schema['class']}")
+
+        # Create a local collection with the same schema (if missing)
+        local_schema = local_client.schema.get()
+        local_classes = {cls["class"] for cls in local_schema.get("classes", [])}
+        if COLLECTION_NAME in local_classes:
+            print("⚠️  Local collection already exists")
+        else:
+            print(f"📝 Creating local collection {COLLECTION_NAME}...")
+            local_client.schema.create_class(schema)
+        
+        # Export + import
+        objects = export_collection(remote_client, COLLECTION_NAME)
+        import_collection(local_client, COLLECTION_NAME, objects)
+        
+        print("\n✨ Migratsioon valmis!")
+        
+    except Exception as e:
+        print(f"❌ Viga: {str(e)}")
+        return 1
+    
+    return 0
+
+if __name__ == "__main__":
+    exit(main())

+ 220 - 0
migrate_weaviate_rest.py

@@ -0,0 +1,220 @@
+#!/usr/bin/env python3
+"""
+Weaviate migration via the REST API - ScientificArticle only
+Strips the vectorizer config so that the local Weaviate can create the schema
+"""
+import requests
+import copy
+
+REMOTE_URL = "http://hetzner:9020"
+LOCAL_URL = "http://localhost:8080"
+COLLECTION_NAME = "ScientificArticle"
+
+def clean_schema(schema: dict):
+    """Eemalda vectorizer/module references"""
+    cleaned = copy.deepcopy(schema)
+    
+    # Eemalda vectorizer konfid
+    cleaned.pop("vectorizer", None)
+    cleaned.pop("vectorizerConfig", None)
+    cleaned.pop("vectorIndexConfig", None)
+    cleaned.pop("vector_config", None)
+    cleaned.pop("vectorIndexType", None)
+    
+    # Eemalda propertytest vectorizer infod
+    for prop in cleaned.get("properties", []):
+        prop.pop("vectorizer", None)
+        prop.pop("vectorizerConfig", None)
+        prop.pop("vectorizer_configs", None)
+    
+    return cleaned
+
+def get_collection_schema(url: str, collection: str):
+    """Tõmba kollektsiooni schema"""
+    print(f"📋 Schema: {collection}...")
+    try:
+        resp = requests.get(f"{url}/v1/schema", timeout=10)
+        schema = resp.json()
+        
+        for cls in schema.get("classes", []):
+            if cls["class"] == collection:
+                print(f"  ✓ Schema leitud")
+                return cls
+        return None
+    except Exception as e:
+        print(f"  ❌ Viga: {e}")
+        return None
+
+def get_objects(url: str, collection: str):
+    """Tõmba kõik objektid REST API abil"""
+    print(f"\n📤 Eksporditakse {collection}...")
+    
+    objects = []
+    offset = 0
+    limit = 100
+    
+    while True:
+        try:
+            resp = requests.get(
+                f"{url}/v1/objects",
+                params={
+                    "class": collection,
+                    "limit": limit,
+                    "offset": offset
+                },
+                timeout=30
+            )
+            
+            if resp.status_code != 200:
+                print(f"  ⚠️  Viga: {resp.status_code}")
+                break
+            
+            data = resp.json()
+            items = data.get("objects", [])
+            
+            if not items:
+                break
+            
+            objects.extend(items)
+            offset += len(items)
+            print(f"  ✓ Tõmmatud: {offset} objekti...")
+            
+            if len(items) < limit:
+                break
+        
+        except Exception as e:
+            print(f"  ❌ Viga: {e}")
+            break
+    
+    print(f"✅ Eksportitud kokku: {len(objects)} objekti\n")
+    return objects
+
+def create_collection_local(url: str, schema: dict):
+    """Loo kolleektsioon lokaalse Weaviate'sse"""
+    collection_name = schema["class"]
+    print(f"📝 Kontrollitakse lokaalne kolleektsioon '{collection_name}'...")
+    
+    try:
+        # Check whether it already exists
+        resp = requests.get(f"{url}/v1/schema", timeout=10)
+        existing = resp.json()
+        
+        for cls in existing.get("classes", []):
+            if cls["class"] == collection_name:
+                print(f"  ✓ Kolleektsioon '{collection_name}' juba eksisteerib\n")
+                return True
+        
+        # Clean the schema (strip the vectorizer info)
+        clean = clean_schema(schema)
+        
+        # Create a new one
+        print("  📝 Creating a new collection (without a vectorizer)...")
+        resp = requests.post(
+            f"{url}/v1/schema",
+            json=clean,
+            timeout=10
+        )
+        
+        if resp.status_code in [200, 201]:
+            print(f"  ✓ Kolleektsioon '{collection_name}' loodud\n")
+            return True
+        else:
+            print(f"  ❌ Viga: {resp.status_code} - {resp.text[:200]}\n")
+            return False
+    
+    except Exception as e:
+        print(f"  ❌ Viga: {e}\n")
+        return False
+
+def import_objects_rest(url: str, collection: str, objects: list):
+    """Importi objektid REST API abil"""
+    print(f"📥 Importitakse {len(objects)} objekti...")
+    
+    success = 0
+    errors = 0
+    
+    for i, obj in enumerate(objects, 1):
+        try:
+            obj_id = obj.get("id")
+            vector = obj.get("vector")
+            properties = obj.get("properties", {})
+            
+            payload = {
+                "class": collection,
+                "id": obj_id,
+                "properties": properties
+            }
+            
+            if vector:
+                payload["vector"] = vector
+            
+            resp = requests.post(
+                f"{url}/v1/objects",
+                json=payload,
+                timeout=10
+            )
+            
+            if resp.status_code in [200, 201]:
+                success += 1
+            else:
+                errors += 1
+                if errors <= 5:
+                    print(f"  ⚠️  Objekt {i}: {resp.status_code}")
+            
+            if i % 50 == 0:
+                print(f"  ✓ Importitud {i}/{len(objects)}...")
+        
+        except Exception as e:
+            errors += 1
+            if errors <= 5:
+                print(f"  ⚠️  Viga {i}: {e}")
+    
+    print(f"\n✅ Import valmis: {success} edu, {errors} viga\n")
+    return success
+
+def main():
+    print(f"\n{'='*60}")
+    print(f"WEAVIATE MIGRATSIOON: {COLLECTION_NAME}")
+    print(f"{'='*60}\n")
+    
+    print("🔗 Kontrollitakse ühendusi...")
+    
+    try:
+        resp = requests.get(f"{REMOTE_URL}/v1/meta", timeout=5)
+        print(f"  ✓ Remote ({REMOTE_URL}): {resp.json().get('version', 'OK')}")
+    except Exception as e:
+        print(f"  ❌ Remote viga: {e}")
+        return 1
+    
+    try:
+        resp = requests.get(f"{LOCAL_URL}/v1/meta", timeout=5)
+        print(f"  ✓ Lokaalne ({LOCAL_URL}): {resp.json().get('version', 'OK')}\n")
+    except Exception as e:
+        print(f"  ❌ Lokaalne viga: {e}")
+        return 1
+    
+    schema = get_collection_schema(REMOTE_URL, COLLECTION_NAME)
+    if not schema:
+        print(f"❌ Kollektsiooni '{COLLECTION_NAME}' remote'l pole\n")
+        return 1
+    
+    if not create_collection_local(LOCAL_URL, schema):
+        print(f"❌ Lokaalne kolleektsioon ei loodud\n")
+        return 1
+    
+    objects = get_objects(REMOTE_URL, COLLECTION_NAME)
+    
+    if not objects:
+        print(f"⚠️  Kollektsioonis '{COLLECTION_NAME}' pole objekte\n")
+        return 0
+    
+    import_objects_rest(LOCAL_URL, COLLECTION_NAME, objects)
+    
+    print(f"{'='*60}")
+    print("✨ Migratsioon valmis!")
+    print(f"{'='*60}\n")
+    return 0
+
+if __name__ == "__main__":
+    exit(main())

+ 33 - 0
scripts/db_vector_välja_muutmine.sh

@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Change the column width
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+  ALTER TABLE chunks
+      ALTER COLUMN embedding TYPE vector(384);
+  \"
+"
+
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+  DROP INDEX IF EXISTS idx_chunks_embedding;
+  CREATE INDEX idx_chunks_embedding ON chunks
+  USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
+  \"
+"
+
+# Add the weaviate_article_id column to the raw_documents table
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+ALTER TABLE raw_documents ADD COLUMN weaviate_article_id UUID;
+CREATE INDEX idx_raw_weaviate_id ON raw_documents(weaviate_article_id);
+  \"
+"
+# Check that the column exists
+docker exec -it postgres_postgis bash -c "
+  export PGPASSWORD='osm' &&
+  psql -U osm -h localhost -d pdf_research -c \"
+  SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'raw_documents' ORDER BY ordinal_position;
+  \"
+"

+ 87 - 0
src/check_status.py

@@ -0,0 +1,87 @@
+# file: src/check_status.py - FIXED
+import os
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def check_status():
+    conn = psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+    cur = conn.cursor()
+
+    print("=" * 60)
+    print("📊 EKSTRAKTSIOONI STAATUS")
+    print("=" * 60)
+
+    # Raw documents
+    cur.execute("""
+    SELECT 
+        COUNT(*) as doc_count, 
+        SUM(pages) as total_pages,
+        SUM(file_size_bytes) as total_size_bytes
+    FROM raw_documents
+    """)
+    doc_count, total_pages, total_size = cur.fetchone()
+    total_size_mb = (total_size or 0) / (1024 * 1024)
+    print(f"\n✓ Raw dokumentid:")
+    print(f"  - Failid: {doc_count}")
+    print(f"  - Lehti kokku: {total_pages}")
+    print(f"  - Suurus: {total_size_mb:.2f} MB")
+
+    # Processed parts
+    cur.execute("""
+    SELECT 
+        COUNT(*) as total_parts,
+        SUM(CASE WHEN has_table THEN 1 ELSE 0 END) as table_parts,
+        SUM(CASE WHEN NOT has_table THEN 1 ELSE 0 END) as text_parts
+    FROM processed_documents
+    """)
+    total_parts, table_parts, text_parts = cur.fetchone()
+    print(f"\n✓ Töödeldud osad:")
+    print(f"  - Kokku: {total_parts}")
+    print(f"  - Tekstiosad: {text_parts}")
+    print(f"  - Tabeliosad: {table_parts}")
+
+    # Document details
+    print("\n✓ Document list:")
+    cur.execute("""
+    SELECT 
+        filename, 
+        pages, 
+        file_hash,
+        extracted_at,
+        (SELECT COUNT(*) FROM processed_documents WHERE raw_doc_id = raw_documents.id) as processed_parts
+    FROM raw_documents 
+    ORDER BY extracted_at DESC
+    """)
+    docs = cur.fetchall()
+    for filename, pages, file_hash, extracted_at, proc_parts in docs:
+        hash_short = file_hash[:8] if file_hash else "N/A"
+        print(f"  - {filename}")
+        print(f"    Hash: {hash_short}... | Lehti: {pages} | Töödeldud osad: {proc_parts}")
+        print(f"    Ekstraktitud: {extracted_at}")
+
+    cur.close()
+    conn.close()
+
+    print("\n" + "=" * 60)
+
+if __name__ == "__main__":
+    check_status()

+ 58 - 0
src/chunk_utils.py

@@ -0,0 +1,58 @@
+# file: src/chunk_utils.py
+import re
+from typing import List, Dict
+
+def split_into_sentences(text: str) -> List[str]:
+    # very simple sentence splitting
+    parts = re.split(r'(?<=[.!?])\s+', text.strip())
+    return [p.strip() for p in parts if p.strip()]
+
+def chunk_text(
+    text: str,
+    max_chars: int = 1500,
+    overlap_chars: int = 300
+) -> List[Dict]:
+    """
+    Recursive / semantiline chunking:
+    - proovib hoida lausepiirid
+    - loob chunkid ~max_chars, ülekate overlap_chars
+    """
+    if len(text) <= max_chars:
+        return [{"text": text, "start": 0, "end": len(text)}]
+
+    sentences = split_into_sentences(text)
+    chunks = []
+    current = ""
+    start_idx = 0
+
+    for sent in sentences:
+        if not current:
+            current = sent
+            start_idx = text.find(sent, start_idx)
+            continue
+
+        # If the next sentence still fits
+        if len(current) + 1 + len(sent) <= max_chars:
+            current = current + " " + sent
+        else:
+            end_idx = start_idx + len(current)
+            chunks.append({"text": current, "start": start_idx, "end": end_idx})
+
+            # Overlap taken from the end of the previous chunk
+            overlap_start = max(0, end_idx - overlap_chars)
+            current = text[overlap_start:end_idx]
+            start_idx = overlap_start
+
+            # If the overlap plus the new sentence is already too long,
+            # start a fresh chunk from the sentence instead of emitting a
+            # chunk that merely duplicates the previous chunk's tail
+            if len(current) + 1 + len(sent) > max_chars:
+                current = sent
+                start_idx = text.find(sent, end_idx)
+            else:
+                current = current + " " + sent
+
+    if current:
+        end_idx = start_idx + len(current)
+        chunks.append({"text": current, "start": start_idx, "end": end_idx})
+
+    return chunks
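+
+if __name__ == "__main__":
+    # Minimal usage sketch (illustration only, not part of the pipeline):
+    demo = "First sentence here. Second sentence follows. " * 50
+    for c in chunk_text(demo, max_chars=120, overlap_chars=30):
+        print(c["start"], c["end"], repr(c["text"][:40]))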

+ 105 - 0
src/clean_and_normalize.py

@@ -0,0 +1,105 @@
+# file: src/clean_and_normalize.py
+import os
+import re
+import json
+from pathlib import Path
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+EXTRACTED_OUTPUT_DIR = Path(os.getenv("EXTRACTED_OUTPUT_DIR", "data/extracted"))
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def clean_text_basic(text: str) -> str:
+    # Very simple cleanup: trim, collapse repeated whitespace
+    if not text:
+        return ""
+    text = text.replace("\r", "\n")
+    text = text.replace("\u00a0", " ")  # non-breaking space
+    text = re.sub(r"[ \t]+", " ", text)
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+
+def process_single_doc(raw_doc_id: int, filename: str):
+    pdf_stem = Path(filename).stem
+    json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"
+
+    if not json_path.exists():
+        print(f"❌ pages.json puudub: {json_path}")
+        return
+
+    pages_data = json.loads(json_path.read_text())
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    inserted = 0
+
+    for page in pages_data:
+        page_num = page["page"]
+        raw_text = page.get("text") or ""
+        text = clean_text_basic(raw_text)
+
+        if not text:
+            print(f"  ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
+            continue
+
+        cur.execute("""
+            INSERT INTO processed_documents
+                (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
+            VALUES
+                (%s, %s, %s, %s, %s, %s, %s)
+        """, (
+            raw_doc_id,
+            page_num,
+            f"page_{page_num}",
+            text,
+            False,
+            None,
+            None,
+        ))
+        inserted += 1
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")
+
+def process_all_docs():
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    # Fetch all raw documents
+    cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
+    docs = cur.fetchall()
+    cur.close()
+    conn.close()
+
+    if not docs:
+        print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
+        return
+
+    for raw_doc_id, filename in docs:
+        print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
+        process_single_doc(raw_doc_id, filename)
+
+    print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")
+
+if __name__ == "__main__":
+    process_all_docs()

+ 115 - 0
src/create_chunks.py

@@ -0,0 +1,115 @@
+# file: src/create_chunks.py
+import os
+from dotenv import load_dotenv
+import psycopg2
+import json
+from src.chunk_utils import chunk_text
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+CHUNK_SIZE = int(os.getenv("CHUNK_SIZE", "1500"))
+CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "300"))
+
+def get_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def create_chunks_for_doc(raw_doc_id: int, filename: str):
+    conn = get_conn()
+    cur = conn.cursor()
+
+    # Fetch all processed_documents rows for this PDF
+    cur.execute("""
+        SELECT id, page, content_text, has_table
+        FROM processed_documents
+        WHERE raw_doc_id = %s
+        ORDER BY page, id
+    """, (raw_doc_id,))
+    rows = cur.fetchall()
+
+    print(f"  -> processed_documents ridu: {len(rows)}")
+
+    chunk_index = 0
+    total_chunks = 0
+
+    for proc_id, page, content_text, has_table in rows:
+        if not content_text or len(content_text.strip()) == 0:
+            print(f"     (id={proc_id}, page={page}) tühi content_text, jätan vahele")
+            continue
+
+        chunks = chunk_text(
+            content_text,
+            max_chars=CHUNK_SIZE,
+            overlap_chars=CHUNK_OVERLAP,
+        )
+
+        print(f"     (id={proc_id}, page={page}) -> {len(chunks)} chunk(i)")
+
+        for ch in chunks:
+            chunk_index += 1
+            meta = {
+                "has_table": bool(has_table),
+                "page": int(page),
+            }
+
+            cur.execute("""
+                INSERT INTO chunks
+                (processed_doc_id, raw_doc_id, filename, chunk_index, page, text, num_tokens, metadata)
+                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
+            """, (
+                proc_id,
+                raw_doc_id,
+                filename,
+                chunk_index,
+                page,
+                ch["text"],
+                len(ch["text"].split()),
+                json.dumps(meta),  # key fix: serialize metadata as JSON
+            ))
+            total_chunks += 1
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"✓ {filename}: loodud {total_chunks} chunk(i), viimase indeks: {chunk_index}")
+
+def create_chunks_for_all():
+    conn = get_conn()
+    cur = conn.cursor()
+
+    cur.execute("""
+        SELECT id, filename
+        FROM raw_documents
+        ORDER BY id
+    """)
+    docs = cur.fetchall()
+
+    for raw_doc_id, filename in docs:
+        # Check whether chunks already exist
+        cur.execute("SELECT COUNT(*) FROM chunks WHERE raw_doc_id = %s", (raw_doc_id,))
+        count = cur.fetchone()[0]
+        if count > 0:
+            print(f"⏭️  {filename}: chunkid juba olemas ({count} rida). Vahele jätan.")
+            continue
+
+        print(f"🧩 Loon chunkid: {filename}")
+        create_chunks_for_doc(raw_doc_id, filename)
+
+    cur.close()
+    conn.close()
+
+if __name__ == "__main__":
+    create_chunks_for_all()

+ 103 - 0
src/db_schema.py

@@ -0,0 +1,103 @@
+# file: src/db_schema.py - with VECTOR support
+
+import os
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def create_schema():
+    conn = psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+    cur = conn.cursor()
+
+    # pgvector extension
+    try:
+        cur.execute("CREATE EXTENSION IF NOT EXISTS vector")
+        print("✓ pgvector laiendus lubatud")
+    except Exception as e:
+        print(f"⚠️  pgvector laiendus: {e}")
+
+    # Raw documents
+    cur.execute("""
+    CREATE TABLE IF NOT EXISTS raw_documents (
+        id SERIAL PRIMARY KEY,
+        filename VARCHAR(255) NOT NULL,
+        file_hash VARCHAR(32),
+        pages INT,
+        file_size_bytes INT,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
+        extracted_at TIMESTAMP,
+        extracted_json_path VARCHAR(255),
+        extracted_tables_dir VARCHAR(255),
+        UNIQUE(filename, file_hash)
+    )
+    """)
+
+    # Processed documents
+    cur.execute("""
+    CREATE TABLE IF NOT EXISTS processed_documents (
+        id SERIAL PRIMARY KEY,
+        raw_doc_id INT REFERENCES raw_documents(id) ON DELETE CASCADE,
+        page INT NOT NULL,
+        section VARCHAR(100),
+        content_text TEXT,
+        has_table BOOLEAN DEFAULT FALSE,
+        table_data JSONB,
+        bbox JSONB,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+    )
+    """)
+
+    # Chunks (VECTOR; created as 1536 dims here, later altered to 384 by scripts/db_vector_välja_muutmine.sh)
+    cur.execute("""
+    CREATE TABLE IF NOT EXISTS chunks (
+        id SERIAL PRIMARY KEY,
+        processed_doc_id INT REFERENCES processed_documents(id) ON DELETE CASCADE,
+        raw_doc_id INT REFERENCES raw_documents(id),
+        filename VARCHAR(255),
+        chunk_index INT,
+        page INT,
+        text TEXT,
+        num_tokens INT,
+        embedding VECTOR(1536),
+        metadata JSONB,
+        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+    )
+    """)
+
+    # Indexes
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_raw_filename ON raw_documents(filename)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_raw_filehash ON raw_documents(file_hash)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_proc_raw_id ON processed_documents(raw_doc_id)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_raw_id ON chunks(raw_doc_id)")
+    cur.execute("CREATE INDEX IF NOT EXISTS idx_chunks_filename ON chunks(filename)")
+    
+    # Vector index (IVFFlat for fast search)
+    try:
+        cur.execute("""
+        CREATE INDEX IF NOT EXISTS idx_chunks_embedding ON chunks 
+        USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100)
+        """)
+        print("✓ Vector indeks loodud")
+    except Exception as e:
+        print(f"⚠️  Vector indeksi loomine: {e}")
+
+    conn.commit()
+    cur.close()
+    conn.close()
+    print("✓ Kõik skeemid loodud!")
+
+if __name__ == "__main__":
+    create_schema()

+ 75 - 0
src/embed_chunks.py

@@ -0,0 +1,75 @@
+# file: src/embed_chunks.py
+import os
+from dotenv import load_dotenv
+import psycopg2
+from src.embed_utils import get_embedding
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+BATCH_LIMIT = int(os.getenv("EMBED_BATCH_LIMIT", "500000"))
+
+def get_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def embed_missing_chunks(limit: int | None = None):
+    if limit is None:
+        limit = BATCH_LIMIT
+
+    conn = get_conn()
+    cur = conn.cursor()
+
+    # Fetch chunks whose embedding is NULL
+    cur.execute("""
+        SELECT id, text
+        FROM chunks
+        WHERE embedding IS NULL
+        ORDER BY id
+        LIMIT %s
+    """, (limit,))
+    rows = cur.fetchall()
+
+    if not rows:
+        print("✓ Kõik chunkid on juba embedded.")
+        cur.close()
+        conn.close()
+        return
+
+    print(f"Embedding {len(rows)} chunk(i)...")
+
+    updated = 0
+    for chunk_id, text in rows:
+        text = (text or "").strip()
+        if not text:
+            print(f"  ⚠️ chunk id={chunk_id}: tühi tekst, jätan vahele")
+            continue
+
+        emb = get_embedding(text)  # list[float]
+
+        # psycopg2 sends the list as an array, which pgvector casts to vector on assignment
+        cur.execute(
+            "UPDATE chunks SET embedding = %s WHERE id = %s",
+            (emb, chunk_id),
+        )
+        updated += 1
+
+    conn.commit()
+    cur.close()
+    conn.close()
+
+    print(f"✓ Uuendatud {updated} chunk(i) embeddingu veeruga.")
+
+if __name__ == "__main__":
+    embed_missing_chunks()

+ 31 - 0
src/embed_utils.py

@@ -0,0 +1,31 @@
+# file: src/embed_utils.py
+import os
+from dotenv import load_dotenv
+from sentence_transformers import SentenceTransformer
+
+load_dotenv()
+
+EMBED_MODEL_NAME = os.getenv("EMBED_MODEL_NAME", "BAAI/bge-small-en-v1.5")
+EMBED_DIM = int(os.getenv("EMBED_DIM", "384"))  # bge-small-en-v1.5
+
+_model: SentenceTransformer | None = None
+
+def get_model() -> SentenceTransformer:
+    global _model
+    if _model is None:
+        print(f"Loading embedding model: {EMBED_MODEL_NAME}")
+        _model = SentenceTransformer(EMBED_MODEL_NAME)
+    return _model
+
+def get_embedding(text: str) -> list[float]:
+    """
+    Tagastab ühe tekstijupi embeddingu.
+    BGE v1.5 ei vaja tingimata special instruction'it, võib kasutada otse. [web:133][web:135][web:140][web:146]
+    """
+    text = (text or "").strip()
+    if not text:
+        return []
+
+    model = get_model()
+    emb = model.encode(text, normalize_embeddings=True)  # normalized embeddings suit cosine similarity
+    return emb.tolist()
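+
+if __name__ == "__main__":
+    # Minimal sanity check (illustration only): the vector length should
+    # match EMBED_DIM (384 for bge-small-en-v1.5)
+    vec = get_embedding("traffic safety test sentence")
+    print(len(vec), vec[:5])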

+ 194 - 0
src/extract_pdf.py

@@ -0,0 +1,194 @@
+# file: src/extract_pdf.py
+import os
+import json
+import csv
+from pathlib import Path
+from dotenv import load_dotenv
+import pdfplumber
+import psycopg2
+import hashlib
+
+load_dotenv()
+
+PDF_INPUT_DIR = Path(os.getenv("PDF_INPUT_DIR", "data/raw_pdfs"))
+EXTRACTED_OUTPUT_DIR = Path(os.getenv("EXTRACTED_OUTPUT_DIR", "data/extracted"))
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def get_file_hash(file_path: Path) -> str:
+    """Arvutab faili MD5 hash'i"""
+    md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+def extract_single_pdf(pdf_path: Path):
+    """
+    Ekstraktib PDF-st:
+    - lehekülgede tekstid JSON-ks (paigutuse metaandmetega)
+    - tabelid CSV-ks (lehekülgeti)
+    """
+    pdf_name = pdf_path.stem
+    output_subdir = EXTRACTED_OUTPUT_DIR / pdf_name
+    output_subdir.mkdir(parents=True, exist_ok=True)
+    
+    pages_data = []
+    tables_meta = []
+    table_count = 0
+    
+    print(f"📖 Käsitlen: {pdf_path.name}")
+    
+    with pdfplumber.open(pdf_path) as pdf:
+        for page_num, page in enumerate(pdf.pages, start=1):
+            # Text
+            text = page.extract_text() or ""
+            
+            # Words with layout info (replaces extract_text_dict)
+            words = page.extract_words() or []
+            
+            # Tables
+            tables = page.extract_tables()
+            page_tables = []
+            
+            for table_idx, table in enumerate(tables, start=1):
+                table_count += 1
+                csv_file = output_subdir / f"page_{page_num}_table_{table_idx}.csv"
+                
+                with csv_file.open("w", newline="", encoding="utf-8") as f:
+                    writer = csv.writer(f)
+                    for row in table:
+                        writer.writerow(row)
+                
+                # FIX: choose one of the variants below
+                
+                # VARIANT 1: resolve (for a path relative to the CWD)
+                page_tables.append({
+                    "table_idx": table_idx,
+                    "csv_file": str(csv_file.resolve().relative_to(Path.cwd().resolve())),
+                    "rows": len(table),
+                    "cols": len(table[0]) if table else 0,
+                })
+                
+                # OR VARIANT 2: keep the relative path (recommended)
+                # page_tables.append({
+                #     "table_idx": table_idx,
+                #     "csv_file": str(csv_file),
+                #     "rows": len(table),
+                #     "cols": len(table[0]) if table else 0,
+                # })
+                
+                # OR VARIANT 3: os.path.relpath
+                # page_tables.append({
+                #     "table_idx": table_idx,
+                #     "csv_file": os.path.relpath(csv_file, start=Path.cwd()),
+                #     "rows": len(table),
+                #     "cols": len(table[0]) if table else 0,
+                # })
+                
+                tables_meta.append({
+                    "page": page_num,
+                    "table_file": str(csv_file),  # Või samuti parandada
+                })
+            
+            # Page-level JSON
+            pages_data.append({
+                "page": page_num,
+                "text": text,
+                "words_count": len(words),
+                "tables": page_tables,
+                "width": page.width,
+                "height": page.height,
+            })
+            
+            print(f"  ✓ Leht {page_num}: {len(text)} tähte, {len(tables)} tabel(it)")
+    
+    # Save the pages as JSON
+    json_file = output_subdir / "pages.json"
+    json_file.write_text(json.dumps(pages_data, ensure_ascii=False, indent=2))
+    
+    tables_json = output_subdir / "tables_meta.json"
+    tables_json.write_text(json.dumps(tables_meta, ensure_ascii=False, indent=2))
+    
+    return {
+        "filename": pdf_path.name,
+        "pages": len(pdf.pages),
+        "json_path": str(json_file),
+        "tables_count": table_count,
+        "output_dir": str(output_subdir),
+    }
+
+def extract_all_pdfs():
+    """Teisendab kõik PDF-d data/raw_pdfs-st"""
+    pdf_files = list(PDF_INPUT_DIR.glob("*.pdf"))
+
+    if not pdf_files:
+        print("❌ PDF-e ei leitud!", PDF_INPUT_DIR)
+        return
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    skipped = 0
+    processed = 0
+
+    for pdf_path in pdf_files:
+        filename = pdf_path.name
+        file_hash = get_file_hash(pdf_path)
+        file_size = pdf_path.stat().st_size
+
+        # Check for duplicates (filename + hash)
+        cur.execute(
+            "SELECT id FROM raw_documents WHERE filename = %s AND file_hash = %s",
+            (filename, file_hash)
+        )
+        existing = cur.fetchone()
+
+        if existing:
+            print(f"⏭️  {filename} (hash: {file_hash[:8]}...) on juba olemas. Vahele jäetud.")
+            skipped += 1
+            continue
+
+        # Extract
+        result = extract_single_pdf(pdf_path)
+
+        # Save to the database
+        cur.execute("""
+        INSERT INTO raw_documents 
+        (filename, file_hash, pages, file_size_bytes, extracted_json_path, extracted_tables_dir, extracted_at)
+        VALUES (%s, %s, %s, %s, %s, %s, CURRENT_TIMESTAMP)
+        RETURNING id
+        """, (
+            filename,
+            file_hash,
+            result["pages"],
+            file_size,
+            result["json_path"],
+            result["output_dir"],
+        ))
+        doc_id = cur.fetchone()[0]
+        conn.commit()
+
+        print(f"✓ {filename} salvestatud (DB ID: {doc_id}, hash: {file_hash[:8]}...)")
+        processed += 1
+
+    cur.close()
+    conn.close()
+
+    print(f"\n✓ Kokkuvõte: {processed} uus, {skipped} juba olemas")
+
+if __name__ == "__main__":
+    extract_all_pdfs()

+ 43 - 0
src/find_duplicates.py

@@ -0,0 +1,43 @@
+# file: src/find_duplicates.py
+
+import os
+from dotenv import load_dotenv
+import psycopg2
+
+load_dotenv()
+
+def find_duplicates():
+    conn = psycopg2.connect(
+        host=os.getenv("DB_HOST", "localhost"),
+        port=os.getenv("DB_PORT", "5432"),
+        database=os.getenv("DB_NAME", "pdf_research"),
+        user=os.getenv("DB_USER", "pdf_user"),
+        password=os.getenv("DB_PASSWORD"),
+    )
+    cur = conn.cursor()
+
+    # Find files extracted more than once (same name + hash)
+    cur.execute("""
+    SELECT filename, file_hash, COUNT(*) as count, array_agg(id) as ids
+    FROM raw_documents
+    WHERE file_hash IS NOT NULL
+    GROUP BY filename, file_hash
+    HAVING COUNT(*) > 1
+    """)
+    duplicates = cur.fetchall()
+
+    if duplicates:
+        print("⚠️  LEITUD DUPLIKAADID:")
+        for filename, file_hash, count, ids in duplicates:
+            print(f"  - {filename} ({count}x)")
+            print(f"    Hash: {file_hash}")
+            print(f"    DB IDs: {ids}")
+            print()
+    else:
+        print("✓ Duplikaate ei leitud!")
+
+    cur.close()
+    conn.close()
+
+if __name__ == "__main__":
+    find_duplicates()

+ 212 - 0
src/generate_answer.py

@@ -0,0 +1,212 @@
+# file: src/generate_answer.py
+import os
+import json
+import requests
+from typing import List, Dict
+from dotenv import load_dotenv
+
+load_dotenv()
+
+LLAMA_CPP_URL = os.getenv("LLAMA_CPP_URL", "http://localhost:8070/completion")
+
+def deduplicate_chunks(chunks: List[Dict]) -> List[Dict]:
+    """Eemaldab duplikaadi chunk'id."""
+    seen = set()
+    unique_chunks = []
+    
+    for chunk in chunks:
+        key = (chunk['filename'], chunk['page'], chunk['text'][:50])
+        if key not in seen:
+            seen.add(key)
+            unique_chunks.append(chunk)
+    
+    return unique_chunks
+
+def build_context(articles: List[Dict], chunks: List[Dict], max_chars: int = 8000) -> str:
+    """
+    Suurendatud kontekst. Weaviate andmete osa oluliselt suurem.
+    """
+    context_parts = []
+    char_count = 0
+    
+    unique_chunks = deduplicate_chunks(chunks)
+    
+    # ENLARGED: Weaviate article metadata
+    context_parts.append("=== ARTICLE SUMMARIES FROM WEAVIATE ===\n")
+    for i, art in enumerate(articles[:8], 1):  # 5 → 8 articles
+        
+        # Build rich article info
+        authors_str = ', '.join(art['authors'][:3]) if art['authors'] else 'N/A'
+        keywords_str = ', '.join(art['key_concepts'][:6]) if art['key_concepts'] else 'N/A'
+        methods_str = ', '.join(art['methods_used'][:4]) if art.get('methods_used') else 'N/A'
+        
+        # Parse transport_context (a JSON string); json is imported at module level
+        transport_str = "N/A"
+        try:
+            if art.get('transport_context'):
+                tc = json.loads(art['transport_context'])
+                theoretical = tc.get('theoretical_contribution', '')
+                practical = tc.get('practical_applicability', '')
+                transport_str = f"Theoretical: {theoretical[:200]}... Practical: {practical[:200]}..."
+        except Exception:
+            transport_str = art.get('transport_context', '')[:300]
+        
+        article_text = f"""[Article {i}] {art['title']}
+Authors: {authors_str}
+Year: {art.get('year', 'N/A')}
+Keywords: {keywords_str}
+Methods Used: {methods_str}
+Transport Context: {transport_str}
+Summary (Estonian): {art['summary_et'][:900]}
+
+"""
+        context_parts.append(article_text)
+        char_count += len(article_text)
+        
+        if char_count > max_chars * 0.45:  # 45% artiklitele
+            break
+    
+    # pgvector excerpts (same as before)
+    context_parts.append("\n=== TEXT EXCERPTS FROM SOURCES ===\n")
+    for i, chunk in enumerate(unique_chunks[:15], 1):
+        chunk_text = f"""[Excerpt {i}] ({chunk['filename']}, page {chunk['page']})
+{chunk['text'][:280]}
+
+"""
+        context_parts.append(chunk_text)
+        char_count += len(chunk_text)
+        
+        if char_count > max_chars:
+            break
+    
+    return "\n".join(context_parts)
+
+def generate_answer(query: str, context: str, temperature: float = 0.6, max_tokens: int = 1500) -> str:
+    """
+    Genereerib vastuse. Sama kui enne, aga kontekst on rikkam.
+    """
+    
+    prompt = f"""You are an expert in scientific research on transportation safety and traffic engineering.
+
+CONTEXT (from scientific sources in Weaviate database):
+{context}
+
+USER QUESTION (in Estonian):
+{query}
+
+INSTRUCTIONS:
+- Answer in fluent Estonian language
+- Use data and findings from the sources provided above
+- Cite articles or excerpts by number in square brackets (e.g., "[Article 1]" or "[Excerpt 3]")
+- Be concrete, detailed, and factual
+- Write at least 250 words (important: use the context fully)
+- Do NOT speculate or add information not present in the context
+- Reference the methods, findings, and practical applications mentioned in the articles
+- Structure your answer clearly with paragraphs
+
+ANSWER (in Estonian):"""
+
+    payload = {
+        "prompt": prompt,
+        "n_predict": max_tokens,
+        "temperature": temperature,
+        "top_p": 0.95,
+        "repeat_penalty": 1.2,
+        "presence_penalty": 0.1,
+        "frequency_penalty": 0.1,
+    }
+
+    print(f"📤 Saandan päring llama.cpp'le...")
+    print(f"   📊 Konteksti suurus: {len(context)} märki ≈ {len(context)//4} tokenit")
+    
+    try:
+        response = requests.post(LLAMA_CPP_URL, json=payload, timeout=240)
+        response.raise_for_status()
+        result = response.json()
+        
+        answer = result.get("content", "").strip()
+        tokens_predicted = result.get("tokens_predicted", 0)
+        
+        print(f"   ✅ Genereeriud: {tokens_predicted} tokenit, {len(answer.split())} sõna")
+        
+        if not answer:
+            return "⚠️ LLM tagastas tühja vastuse."
+        
+        word_count = len(answer.split())
+        if word_count < 100:
+            print(f"   ⚠️  Liiga lühike ({word_count} sõna) – konteksti ei piisa")
+            return f"⚠️ LLM vastus on liiga lühike ({word_count} sõna):\n\n{answer}"
+        
+        return answer
+        
+    except Exception as e:
+        return f"❌ Viga: {e}"
+
+def rag_query(query: str, articles: List[Dict], chunks: List[Dict], 
+              temperature: float = 0.6, max_tokens: int = 1500) -> Dict:
+    """RAG tsükkel."""
+    print(f"\n{'='*70}")
+    print(f"🔍 RAG PÄRING")
+    print(f"{'='*70}")
+    print(f"Küsimus: {query}")
+    print(f"Artikleid: {len(articles)}, Chunk'e: {len(chunks)}")
+    
+    context = build_context(articles, chunks, max_chars=8000)  # 5000 → 8000
+    answer = generate_answer(query, context, temperature=temperature, max_tokens=max_tokens)
+    
+    # DEBUG: save the context for inspection
+    with open("/tmp/rag_context_debug.txt", "w") as f:
+        f.write(context)
+    
+    return {
+        "query": query,
+        "answer": answer,
+        "sources": {
+            "articles": [
+                {
+                    "title": a["title"], 
+                    "authors": a.get("authors", [])[:3],
+                    "score": a.get("score", 0)
+                } 
+                for a in articles[:8]  # 5 → 8
+            ],
+            "chunks_used": len(set((c['filename'], c['page']) for c in chunks[:15]))
+        },
+        "parameters": {
+            "context_chars": len(context),
+            "context_tokens_approx": len(context) // 4,
+            "context_weaviate_chars": len("\n".join([a['summary_et'][:900] for a in articles[:8]])),
+        }
+    }
+
+if __name__ == "__main__":
+    from src.query_hybrid import hybrid_search
+    
+    query = "young driver accident risk"
+    query = "traffic flow"
+    query = "road safety"
+    print(f"\n🚀 Käivitan hübriidotsingu...")
+    
+    results = hybrid_search(query, top_articles=10, top_chunks=20)
+    
+    print(f"\n🤖 Genereerin RAG vastust...")
+    rag_result = rag_query(
+        query=results["query"],
+        articles=results["articles"],
+        chunks=results["chunks"],
+        temperature=0.6,
+        max_tokens=1500
+    )
+    
+    print("\n" + "=" * 70)
+    print("🎯 RAG VASTUS:")
+    print("=" * 70)
+    print(rag_result["answer"])
+    
+    print("\n" + "=" * 70)
+    print("📊 STATISTIKA:")
+    print("=" * 70)
+    print(f"Weaviate konteksti: {rag_result['parameters']['context_weaviate_chars']} märki")
+    print(f"Kokku konteksti: {rag_result['parameters']['context_chars']} märki")
+    print(f"Kokku tokenit: ~{rag_result['parameters']['context_tokens_approx']}")

+ 214 - 0
src/query_hybrid.py

@@ -0,0 +1,214 @@
+# file: src/query_hybrid.py
+import os
+from dotenv import load_dotenv
+import psycopg2
+from weaviate import WeaviateClient
+from weaviate.connect import ConnectionParams
+from src.embed_utils import get_embedding
+import json
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+WEAVIATE_HOST = os.getenv("WEAVIATE_HOST", "localhost")
+WEAVIATE_HTTP_PORT = int(os.getenv("WEAVIATE_HTTP_PORT", "8080"))
+WEAVIATE_CLASS = os.getenv("WEAVIATE_CLASS", "ScientificArticle")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def get_weaviate_client() -> WeaviateClient:
+    """Weaviate v4 klient - HTTP ainult"""
+    client = WeaviateClient(connection_params=ConnectionParams.from_params(
+        http_host=WEAVIATE_HOST,
+        http_port=WEAVIATE_HTTP_PORT,
+        http_secure=False,
+        grpc_host=WEAVIATE_HOST,
+        grpc_port=50051,
+        grpc_secure=False,
+    ))
+    client.connect()
+    return client
+
+def search_weaviate_articles_all(limit: int = 10000) -> list[dict]:
+    """
+    Loeb KÕIK artiklid Weaviate'st (fetch_objects),
+    ei kasuta near_text (mis vajavad Ollama embedding'uid).
+    """
+    client = get_weaviate_client()
+
+    try:
+        collection = client.collections.get(WEAVIATE_CLASS)
+        
+        # Fetch all objects without a search
+        results = collection.query.fetch_objects(
+            limit=limit,
+            return_properties=["title", "source_file", "summary_et", "key_concepts", 
+                             "authors", "transport_context", "relevance_score", "abstract_en"]
+        )
+
+        articles = []
+        for obj in results.objects:
+            props = obj.properties
+            articles.append({
+                "title": props.get("title", "N/A"),
+                "article_id": str(obj.uuid),
+                "summary_et": (props.get("summary_et", "") or "")[:500],
+                "key_concepts": props.get("key_concepts", []),
+                "authors": props.get("authors", []),
+                "transport_context": props.get("transport_context", ""),
+                "relevance_score": props.get("relevance_score", 0),
+                "abstract_en": (props.get("abstract_en", "") or "")[:500],
+                "source_file": props.get("source_file", ""),
+            })
+
+        return articles
+
+    except Exception as e:
+        print(f"❌ Weaviate päringu viga: {e}")
+        import traceback
+        traceback.print_exc()
+        return []
+    finally:
+        client.close()
+
+def search_articles_by_query(query: str, all_articles: list[dict], limit: int = 10) -> list[dict]:
+    """
+    Otsib artikleid query abil, kasutades lokaalseid embeddings'eid (bge-small).
+    Arvutab sarnasuse vektoritega.
+    """
+    query_emb = get_embedding(query)
+    
+    # Search over the articles' summary_et and abstract_en
+    scored_articles = []
+    
+    for article in all_articles:
+        # Take the text (summary_et or abstract_en)
+        text = article.get("summary_et") or article.get("abstract_en") or ""
+        
+        if not text:
+            continue
+        
+        # Compute the embedding
+        text_emb = get_embedding(text)
+        
+        # Cosine similarity (dot product of normalized vectors)
+        similarity = sum(a * b for a, b in zip(query_emb, text_emb))
+        
+        scored_articles.append({
+            **article,
+            "score": similarity
+        })
+    
+    # Sort and take the top N
+    scored_articles.sort(key=lambda x: x["score"], reverse=True)
+    return scored_articles[:limit]
+
+def search_pgvector_chunks(query: str, article_ids: list[str], limit: int = 20) -> list[dict]:
+    """
+    Otsib pgvectorist chunkid, mis kuuluvad antud artiklitele.
+    """
+    if not article_ids:
+        return []
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    query_emb = get_embedding(query)
+
+    cur.execute("""
+        SELECT 
+            c.id,
+            c.text,
+            c.page,
+            c.chunk_index,
+            r.filename,
+            r.weaviate_article_id,
+            1 - (c.embedding <=> %s::vector) AS similarity
+        FROM chunks c
+        JOIN raw_documents r ON c.raw_doc_id = r.id
+        WHERE r.weaviate_article_id = ANY(%s::UUID[])
+        ORDER BY c.embedding <=> %s::vector
+        LIMIT %s
+    """, (query_emb, article_ids, query_emb, limit))
+
+    chunks = []
+    for row in cur.fetchall():
+        chunks.append({
+            "chunk_id": row[0],
+            "text": row[1],
+            "page": row[2],
+            "chunk_index": row[3],
+            "filename": row[4],
+            "weaviate_article_id": str(row[5]) if row[5] else None,
+            "similarity": float(row[6]),
+        })
+
+    cur.close()
+    conn.close()
+
+    return chunks
+
+def hybrid_search(query: str, top_articles: int = 10, top_chunks: int = 20) -> dict:
+    """
+    Kaheastmeline otsing:
+    1. Leia top artikleid Weaviate'st (lokaalsete embeddings'idega)
+    2. Leia nende artiklite chunkide seast parimad pgvectorist
+    """
+    print(f"\n🔍 Hübriidotsing: '{query}'\n")
+
+    # Step 1: Read all articles from Weaviate
+    print("Step 1: Reading articles from Weaviate...")
+    all_articles = search_weaviate_articles_all(limit=10000)
+    print(f"✓ Found {len(all_articles)} articles in total")
+
+    # Step 2: Search with the query (local embeddings)
+    print(f"Step 2: Searching (among {len(all_articles)} articles)...")
+    articles = search_articles_by_query(query, all_articles, limit=top_articles)
+    print(f"✓ Found {len(articles)} relevant articles")
+
+    if articles:
+        for i, art in enumerate(articles, 1):
+            print(f"  {i}. {art['title'][:60]}...")
+            print(f"     Relevants: {art['relevance_score']:.1f} | Score: {art['score']:.3f}")
+
+    # Step 3: pgvector chunk search
+    print("\nStep 3: Searching pgvector (chunk level)...")
+    article_ids = [a["article_id"] for a in articles if a["article_id"]]
+    
+    if article_ids:
+        chunks = search_pgvector_chunks(query, article_ids, limit=top_chunks)
+        print(f"✓ Leitud {len(chunks)} chunk'i")
+
+        if chunks:
+            for i, chunk in enumerate(chunks[:5], 1):
+                print(f"  {i}. {chunk['filename']} (leht {chunk['page']}, sim: {chunk['similarity']:.3f})")
+                print(f"     \"{chunk['text'][:100]}...\"")
+    else:
+        chunks = []
+        print(f"⚠️ Ühtegi artiklit ei leitud, chunkide otsingut ei saa teha")
+
+    return {
+        "query": query,
+        "articles": articles,
+        "chunks": chunks,
+    }
+
+if __name__ == "__main__":
+    #results = hybrid_search("young driver accident risk", top_articles=10, top_chunks=20)
+    results = hybrid_search("modelling traffic volume in rural areas", top_articles=10, top_chunks=20)
+
+    print("\n" + "=" * 60)
+    print("TÄIELIK TULEMUS:")
+    print(json.dumps(results, default=str, indent=2, ensure_ascii=False)[:3000])

+ 276 - 0
src/rag_api.py

@@ -0,0 +1,276 @@
+# file: src/rag_api.py
+"""
+RAG API - FastAPI server for scientific-article search and LLM answers.
+Variant A: Weaviate (metadata) + pgvector (chunks) + unsloth LLM (answers)
+"""
+
+import logging
+from typing import List
+from datetime import datetime
+
+from fastapi import FastAPI, HTTPException, Query
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel, Field
+
+from dotenv import load_dotenv
+
+from src.query_hybrid import hybrid_search
+from src.generate_answer import rag_query
+
+# Setup logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+# FastAPI app
+app = FastAPI(
+    title="RAG API - Transportation Safety Research",
+    description="Hübriidotsing teadusartiklites + LLM vastuste genereerimine",
+    version="1.0.0"
+)
+
+# CORS middleware
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+# Pydantic models
+class RAGRequest(BaseModel):
+    """Input for a RAG query (Field, not Query, is the idiomatic choice for body models)"""
+    query: str = Field(..., min_length=5, max_length=500, description="Question in Estonian")
+    top_articles: int = Field(10, ge=1, le=20, description="How many articles to retrieve (1-20)")
+    top_chunks: int = Field(20, ge=1, le=50, description="How many chunks to retrieve (1-50)")
+    temperature: float = Field(0.6, ge=0.1, le=1.0, description="LLM temperature (0.1-1.0)")
+    max_tokens: int = Field(1500, ge=500, le=3000, description="Max tokens for the answer (500-3000)")
+
+class Article(BaseModel):
+    """Artikli metaandmed"""
+    title: str
+    article_id: str
+    authors: List[str]
+    relevance_score: float
+    score: float
+
+class Chunk(BaseModel):
+    """Tekstilõik"""
+    chunk_id: int
+    filename: str
+    page: int
+    text: str
+    similarity: float
+
+class RAGResponse(BaseModel):
+    """RAG vastuse struktuur"""
+    query: str
+    answer: str
+    articles: List[Article]
+    chunks: List[Chunk]
+    parameters: dict
+    timestamp: str
+    success: bool
+
+# API endpoints
+
+@app.get("/", tags=["Info"])
+async def root():
+    """RAG API juurepunkt"""
+    return {
+        "name": "RAG API - Transportation Safety",
+        "version": "1.0.0",
+        "endpoints": {
+            "search": "/search",
+            "health": "/health",
+            "docs": "/docs"
+        }
+    }
+
+@app.get("/health", tags=["Info"])
+async def health():
+    """Süsteemi tervisekontroll"""
+    return {
+        "status": "healthy",
+        "timestamp": datetime.now().isoformat(),
+        "version": "1.0.0"
+    }
+
+@app.post("/search", response_model=RAGResponse, tags=["RAG"])
+async def search_rag(request: RAGRequest) -> RAGResponse:
+    """
+    RAG otsing: hübriidotsing + LLM vastuse genereerimine
+    
+    **Protsess:**
+    1. Hübriidotsing (Weaviate + pgvector)
+    2. LLM vastuse genereerimine kontekstiga
+    3. Vastuse tagastamine JSON-na
+    
+    **Näide:**
+    ```json
+    {
+        "query": "young driver accident risk",
+        "top_articles": 10,
+        "top_chunks": 20,
+        "temperature": 0.6,
+        "max_tokens": 1500
+    }
+    ```
+    """
+    
+    try:
+        logger.info(f"🔍 RAG päring: '{request.query}'")
+        
+        # Step 1: hybrid search
+        logger.info("📚 Hybrid search (Weaviate + pgvector)...")
+        hybrid_results = hybrid_search(
+            query=request.query,
+            top_articles=request.top_articles,
+            top_chunks=request.top_chunks
+        )
+        
+        # Step 2: LLM answer generation
+        logger.info("🤖 Generating LLM answer...")
+        rag_results = rag_query(
+            query=hybrid_results["query"],
+            articles=hybrid_results["articles"],
+            chunks=hybrid_results["chunks"],
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
+        )
+        
+        # Step 3: format the response
+        logger.info("✅ Answer ready")
+        
+        # Convert articles and chunks into response models
+        articles = [
+            Article(
+                title=a["title"],
+                article_id=a["article_id"],
+                authors=a.get("authors", [])[:3],
+                relevance_score=a.get("relevance_score", 0),
+                score=a.get("score", 0)
+            )
+            for a in hybrid_results["articles"][:8]
+        ]
+        
+        chunks = [
+            Chunk(
+                chunk_id=c["chunk_id"],
+                filename=c["filename"],
+                page=c["page"],
+                text=c["text"][:500],  # Lühendatud tekst JSON jaoks
+                similarity=c["similarity"]
+            )
+            for c in hybrid_results["chunks"][:10]
+        ]
+        
+        response = RAGResponse(
+            query=request.query,
+            answer=rag_results["answer"],
+            articles=articles,
+            chunks=chunks,
+            parameters=rag_results["parameters"],
+            timestamp=datetime.now().isoformat(),
+            success=True
+        )
+        
+        return response
+        
+    except Exception as e:
+        logger.error(f"❌ Viga: {str(e)}")
+        raise HTTPException(
+            status_code=500,
+            detail={
+                "error": str(e),
+                "query": request.query,
+                "timestamp": datetime.now().isoformat()
+            }
+        )
+
+@app.get("/search-simple", tags=["RAG"])
+async def search_simple(
+    q: str = Query(..., min_length=5, max_length=500, description="Question")
+):
+    """
+    Lihtsustatud RAG otsing (vaikimisi parameetritega)
+    
+    **Näide:** `/search-simple?q=young%20driver%20risk`
+    """
+    
+    try:
+        request = RAGRequest(
+            query=q,
+            top_articles=10,
+            top_chunks=20,
+            temperature=0.6,
+            max_tokens=1500
+        )
+        
+        return await search_rag(request)
+        
+    except Exception as e:
+        raise HTTPException(
+            status_code=400,
+            detail=str(e)
+        )
+
+@app.get("/articles", tags=["Info"])
+async def list_articles(
+    limit: int = Query(10, ge=1, le=100),
+    skip: int = Query(0, ge=0)
+):
+    """
+    Artiklite loend (metadata)
+    """
+    
+    try:
+        from src.query_hybrid import search_weaviate_articles_all
+        
+        articles = search_weaviate_articles_all(limit=limit + skip)
+        articles = articles[skip:skip + limit]
+        
+        return {
+            "total": len(articles),
+            "articles": [
+                {
+                    "title": a["title"],
+                    "authors": a.get("authors", [])[:3],
+                    "relevance_score": a.get("relevance_score", 0),
+                    "year": a.get("year", "N/A")
+                }
+                for a in articles
+            ]
+        }
+        
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+# Error handlers
+
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request, exc):
+    """Custom HTTP exception handler"""
+    return JSONResponse(
+        status_code=exc.status_code,
+        content={
+            "error": exc.detail,
+            "timestamp": datetime.now().isoformat()
+        }
+    )
+
+if __name__ == "__main__":
+    import uvicorn
+    
+    # Start the FastAPI server
+    uvicorn.run(
+        app,
+        host="0.0.0.0",
+        port=8071,
+        log_level="info"
+    )

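Once the server is up, the read-only endpoints (`/health`, `/articles`) can be exercised with a short script; a sketch assuming the default port 8071 from the `uvicorn.run` call above:

```python
import requests

BASE = "http://localhost:8071"  # port from the uvicorn.run call above

# Health check
print(requests.get(f"{BASE}/health").json())

# Paginated article listing (limit/skip as declared on /articles)
resp = requests.get(f"{BASE}/articles", params={"limit": 5, "skip": 0})
resp.raise_for_status()
for art in resp.json()["articles"]:
    print(art["title"], "-", art.get("year", "N/A"))
```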
+ 133 - 0
src/sync_weaviate.py

@@ -0,0 +1,133 @@
+# file: src/sync_weaviate.py
+import os
+from pathlib import Path
+from dotenv import load_dotenv
+import psycopg2
+from weaviate import WeaviateClient
+from weaviate.connect import ConnectionParams
+
+load_dotenv()
+
+DB_HOST = os.getenv("DB_HOST", "localhost")
+DB_PORT = os.getenv("DB_PORT", "5432")
+DB_NAME = os.getenv("DB_NAME", "pdf_research")
+DB_USER = os.getenv("DB_USER", "pdf_user")
+DB_PASSWORD = os.getenv("DB_PASSWORD")
+
+# Weaviate - use the actual host address
+WEAVIATE_HOST = os.getenv("WEAVIATE_HOST", "localhost")
+WEAVIATE_HTTP_PORT = int(os.getenv("WEAVIATE_HTTP_PORT", "8080"))
+WEAVIATE_CLASS = os.getenv("WEAVIATE_CLASS", "ScientificArticle")
+
+def get_db_conn():
+    return psycopg2.connect(
+        host=DB_HOST,
+        port=DB_PORT,
+        database=DB_NAME,
+        user=DB_USER,
+        password=DB_PASSWORD,
+    )
+
+def get_weaviate_client() -> WeaviateClient:
+    """Weaviate v4 klient - HTTP ainult"""
+    client = WeaviateClient(connection_params=ConnectionParams.from_params(
+        http_host=WEAVIATE_HOST,
+        http_port=WEAVIATE_HTTP_PORT,
+        http_secure=False,
+        grpc_host=WEAVIATE_HOST,
+        grpc_port=50051,  # default gRPC port; the v4 client runs collection queries over gRPC, so it should be reachable
+        grpc_secure=False,
+    ))
+    client.connect()
+    print(f"✓ Ühendatud Weaviate'ga: {WEAVIATE_HOST}:{WEAVIATE_HTTP_PORT}")
+    return client
+
+def sync_weaviate_to_postgres():
+    """
+    Loeb Weaviate'st kõik ScientificArticle objektid,
+    otsib vastavad raw_documents read (source_file järgi),
+    ja uuendab weaviate_article_id veeru.
+    """
+    print("Connecting to Weaviate...")
+    client = get_weaviate_client()
+    
+    try:
+        is_ready = client.is_ready()
+        print(f"✓ Weaviate on valmis: {is_ready}")
+    except Exception as e:
+        print(f"❌ Weaviate ühenduse viga: {e}")
+        client.close()
+        return
+
+    print(f"Fetching articles from Weaviate (klass: {WEAVIATE_CLASS})...")
+    
+    try:
+        # Weaviate v4: fetch_objects without a filter
+        collection = client.collections.get(WEAVIATE_CLASS)
+        
+        # Fetch all objects - use a large limit
+        results = collection.query.fetch_objects(
+            limit=10000,
+            return_properties=["source_file", "title"]
+        )
+        
+        print(f"✓ Leitud {len(results.objects)} artiklit Weaviate'st")
+    except Exception as e:
+        print(f"❌ Weaviate päringu viga: {e}")
+        import traceback
+        traceback.print_exc()
+        client.close()
+        return
+
+    conn = get_db_conn()
+    cur = conn.cursor()
+
+    synced = 0
+    not_found = 0
+    errors = 0
+
+    for obj in results.objects:
+        try:
+            props = obj.properties
+            source_file = props.get("source_file")
+            title = props.get("title", "N/A")
+            weaviate_id = obj.uuid
+
+            if not source_file or not weaviate_id:
+                print(f"⚠️ Puudub source_file või id: {title}")
+                not_found += 1
+                continue
+
+            weaviate_uuid = str(weaviate_id)
+            filename = Path(source_file).name
+
+            cur.execute(
+                "SELECT id FROM raw_documents WHERE filename = %s",
+                (filename,)
+            )
+            row = cur.fetchone()
+
+            if row:
+                raw_doc_id = row[0]
+                cur.execute(
+                    "UPDATE raw_documents SET weaviate_article_id = %s WHERE id = %s",
+                    (weaviate_uuid, raw_doc_id)
+                )
+                synced += 1
+                print(f"✓ {title[:50]}... (pg_id={raw_doc_id})")
+            else:
+                not_found += 1
+                print(f"⚠️ Ei leitud pgvector'ist: {filename}")
+        except Exception as e:
+            errors += 1
+            print(f"❌ Viga objektiga: {e}")
+
+    conn.commit()
+    cur.close()
+    conn.close()
+    client.close()
+
+    print(f"\n✓ Kokkuvõte: {synced} sünkroniseeritud, {not_found} ei leitud, {errors} viga")
+
+if __name__ == "__main__":
+    sync_weaviate_to_postgres()
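After a sync run, the linkage can be verified straight from Postgres. A minimal sketch reusing the script's connection settings; it assumes only the raw_documents.weaviate_article_id column that the UPDATE above writes:

```python
# Count how many raw_documents rows got linked to a Weaviate article.
import os
import psycopg2
from dotenv import load_dotenv

load_dotenv()

conn = psycopg2.connect(
    host=os.getenv("DB_HOST", "localhost"),
    port=os.getenv("DB_PORT", "5432"),
    database=os.getenv("DB_NAME", "pdf_research"),
    user=os.getenv("DB_USER", "pdf_user"),
    password=os.getenv("DB_PASSWORD"),
)
with conn.cursor() as cur:
    cur.execute("""
        SELECT COUNT(*) FILTER (WHERE weaviate_article_id IS NOT NULL),
               COUNT(*)
        FROM raw_documents
    """)
    linked, total = cur.fetchone()
    print(f"{linked}/{total} raw_documents linked to Weaviate articles")
conn.close()
```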