Quellcode durchsuchen

Vahetulemus 5-st 3 artikli import korras

Ardo Kubjas vor 4 Monaten
Ursprung
Commit
ca37369b2b

+ 3 - 0
.gitignore

@@ -61,4 +61,7 @@ target/
 # Ardo
 data/logs
 data/pdfs
+data/processed
+tmp/
+.venv/
 ..env.swp

+ 9 - 0
LOEMIND.md

@@ -0,0 +1,9 @@
+
+```bash
+sudo apt install python3-ipykernel
+cd ~/rag-demo
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+pip install -r transpordi_artiklid/requirements.txt
+```

+ 96 - 0
LOEMIND_GIT.md

@@ -0,0 +1,96 @@
+# Git Repo Kasutusjuhend
+
+## Põhitoimingud
+
+### Algus
+```bash
+# Klooni repo (kui pole veel tehtud)
+git clone https://gogs.odamus.com/weaviate/transpordi_artiklid.git
+cd transpordi_artiklid
+```
+
+### Igapäevased toimingud
+
+1. **Enne tööd alustamist uuenda kohalikku haru:**
+```bash
+git pull origin master
+```
+
+2. **Vaata muudatusi:**
+```bash
+git status
+git diff
+```
+
+3. **Lisa muudatused:**
+```bash
+# Lisa konkreetne fail
+git add <faili_nimi>
+
+# Lisa kõik muudatused
+git add .
+
+# Lisa osaliselt (interaktiivne)
+git add -p
+```
+
+4. **Loo commit:**
+```bash
+git commit -m "Selgitav sõnum"
+```
+
+5. **Saada muudatused serverisse:**
+```bash
+git push origin master
+```
+
+### Harude (branches) kasutamine
+
+```bash
+# Uue haru loomine
+git checkout -b uus-haru
+
+# Haru vahetamine
+git checkout haru-nimi
+
+# Haru ühendamine masterisse
+git checkout master
+git merge uus-haru
+git push origin master
+```
+
+## Olulised nõuanded
+
+### 1. **Enne push'i alati pull**
+Alati tee `git pull` enne kui hakkad tööle, et vältida konflikte.
+
+### 2. **Kommenteeri hästi**
+Commit sõnumid peaksid olema selged ja kirjeldavad.
+
+### 3. **Ära push'i suuri faile**
+Git pole mõeldud piltide, videote või muude suurte failide jaoks. Kasuta .gitignore faili.
+
+### 4. **Tööta harudes**
+Suurte muudatuste tegemisel loo uus haru, et mitte segada peaharuga.
+
+### 5. **Salvesta tihti**
+Tee commite tihti - parem palju väikeseid commite kui üks suur.
+
+## Abikäsud
+
+```bash
+# Ajalugu vaatamine
+git log --oneline
+
+# Muudatuste tagasivõtmine
+git checkout -- <fail>
+
+# Viimase commit'i muutmine
+git commit --amend
+```
+
+## Seadistus
+Sinu repo on seadistatud jälgima kaugrepot `origin` aadressil:
+`https://gogs.odamus.com/weaviate/transpordi_artiklid.git`
+
+Peaharu (`master`) on seotud kaugrepo master haruga.

+ 1 - 1
README.md

@@ -14,6 +14,6 @@ Süsteem teadusartiklite automaatseks töötlemiseks, analüüsiks ja salvestami
 
 1. Klooni repository:
 ```bash
-git clone [repository-url]
+git clone https://gogs.odamus.com/weaviate/transpordi_artiklid.git
 cd transpordi_artiklid
 ```

Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 19
data/processed/article_20251229_093529.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 22
data/processed/article_20251229_093733.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 22
data/processed/article_20251229_093944.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 21
data/processed/article_20251229_094153.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 25
data/processed/article_20251229_094407.json


Datei-Diff unterdrückt, da er zu groß ist
+ 83 - 0
jupyter/WeaviateExportImport.ipynb


+ 219 - 0
jupyter/weaviate_export_import_clean.py

@@ -0,0 +1,219 @@
+"""
+Weaviate Collection Export/Import Utility
+
+Korduvkasutatav tööriist Weaviate kollektsioonide eksportimiseks ja importimiseks JSON backup failide kaudu.
+Toetab UUID normaliseerimist, int/float tüüpe, doc_hash, vigu ja batch operatsioone.
+"""
+import datetime
+import json
+import uuid
+import logging
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Union
+from weaviate import WeaviateClient, ConnectionParams
+from weaviate.classes.config import Property, DataType
+from decimal import Decimal
+import ijson
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
class WeaviateExportImport:
    """Reusable helper for exporting and importing Weaviate collections.

    Objects are streamed to and from JSON backup files; UUIDs, integer
    fields and doc_hash values are normalised along the way, and imports
    run in batches.
    """

    def __init__(self, src_client: Optional[WeaviateClient] = None, dst_client: Optional[WeaviateClient] = None):
        # src_client is needed for exports, dst_client for imports.
        self.src_client = src_client
        self.dst_client = dst_client

    @staticmethod
    def create_client(host: str, http_port: int = 9020, grpc_port: int = 50051, secure: bool = False) -> WeaviateClient:
        """Create, connect and return a WeaviateClient for the given host/ports."""
        connection = ConnectionParams.from_params(
            http_host=host,
            http_port=http_port,
            http_secure=secure,
            grpc_host=host,
            grpc_port=grpc_port,
            grpc_secure=secure,
        )
        client = WeaviateClient(connection_params=connection)
        client.connect()
        logger.info(f"Ühendatud Weaviate'ga: {host}:{http_port}")
        return client

    def normalize_int_fields(self, props: Dict[str, Any], int_fields: List[str] = None) -> Dict[str, Any]:
        """Coerce float/Decimal values of the named fields back to int, in place."""
        fields = ["page_start", "page_end", "chunk"] if int_fields is None else int_fields
        for name in fields:
            if name not in props:
                continue
            current = props[name]
            if isinstance(current, float) and current.is_integer():
                props[name] = int(current)
            elif isinstance(current, Decimal):
                # Whole-number Decimals become int, fractional ones float.
                props[name] = int(current) if current % 1 == 0 else float(current)
        return props

    def normalize_doc_hash(self, doc_hash: Any) -> str:
        """Return doc_hash as a dash-free string (UUIDs become bare hex)."""
        if isinstance(doc_hash, uuid.UUID):
            return doc_hash.hex
        if isinstance(doc_hash, str) and "-" in doc_hash and len(doc_hash) == 36:
            return doc_hash.replace("-", "")
        return str(doc_hash)

    def clean_uuid(self, obj: Any) -> Any:
        """Recursively replace UUID(-like) values with plain strings."""
        if isinstance(obj, dict):
            return {key: self.clean_uuid(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [self.clean_uuid(entry) for entry in obj]
        if isinstance(obj, uuid.UUID):
            return str(obj)
        # Also catches third-party UUID wrapper types by class name.
        if hasattr(obj, "__str__") and obj.__class__.__name__.lower().startswith("uuid"):
            return str(obj)
        return obj

    def process_properties(self, props: Dict[str, Any], int_fields: List[str] = None, hash_fields: List[str] = None) -> Dict[str, Any]:
        """Run all property normalisations: UUIDs, integer fields, hash fields."""
        targets = ["doc_hash"] if hash_fields is None else hash_fields
        props = self.clean_uuid(props)
        props = self.normalize_int_fields(props, int_fields)
        for name in targets:
            if name in props:
                props[name] = self.normalize_doc_hash(props[name])
        return props

    def export_collection(self, collection_name: str, output_file: Union[str, Path], 
                                    int_fields: List[str] = None, hash_fields: List[str] = None, 
                                    include_vectors: bool = True) -> int:
        """Stream every object of a collection into a JSON array file.

        Returns the number of exported objects. Raises ValueError when no
        source client is configured.
        """
        if not self.src_client:
            raise ValueError("Source client pole määratud")

        logger.info(f"Alustan kollektsiooni '{collection_name}' streaming eksporti...")
        source = self.src_client.collections.get(collection_name)
        target_path = Path(output_file)
        total = 0

        def encode_fallback(value):
            # json.dump fallback: ISO strings for datetimes, str() for the rest.
            if isinstance(value, datetime.datetime):
                return value.isoformat()
            return str(value)

        # Write objects straight to disk rather than collecting them in memory.
        with open(target_path, "w", encoding="utf-8") as out:
            out.write("[\n")  # open the JSON array
            wrote_any = False

            for item in source.iterator(include_vector=include_vectors):
                record = {
                    'uuid': str(item.uuid),
                    'properties': self.process_properties(dict(item.properties),
                                                          int_fields=int_fields,
                                                          hash_fields=hash_fields),
                }
                if include_vectors:
                    record['vector'] = item.vector

                # Comma-separate all but the first element.
                if wrote_any:
                    out.write(",\n")
                json.dump(record, out, ensure_ascii=False, default=encode_fallback)
                wrote_any = True
                total += 1

                # Progress message every 1000 objects.
                if total % 1000 == 0:
                    logger.info(f"Eksporditud: {total} objekti...")

            out.write("\n]")  # close the JSON array

        logger.info(f"Eksport valmis: {total} objekti")
        return total

    def clean_decimals(self, obj: Any) -> Any:
        """Convert every Decimal in a nested structure to float."""
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, dict):
            return {key: self.clean_decimals(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [self.clean_decimals(entry) for entry in obj]
        return obj

    def import_collection(self, collection_name: str, input_file: Union[str, Path],
                        int_fields: List[str] = None, batch_size: int = 100,
                        recreate_collection: bool = False) -> int:
        """Stream a JSON backup file into a collection, batch by batch.

        Returns the number of imported objects. Raises ValueError when no
        destination client is configured.
        NOTE(review): recreate_collection is accepted but currently unused —
        confirm whether schema recreation should happen here.
        """
        if not self.dst_client:
            raise ValueError("Destination client pole määratud")

        logger.info(f"Alustan kollektsiooni '{collection_name}' streaming importi...")
        target = self.dst_client.collections.get(collection_name)
        source_path = Path(input_file)

        total = 0
        pending = []

        # ijson yields one array element at a time instead of loading the file
        # whole; use_float=True avoids Decimal values in the parsed objects.
        with open(source_path, 'rb') as handle:
            for record in ijson.items(handle, 'item', use_float=True):
                try:
                    props = self.clean_decimals(record["properties"])
                    if int_fields:
                        props = self.normalize_int_fields(props, int_fields)

                    # Vectors can contain Decimals too.
                    vector = record.get("vector")
                    if vector is not None:
                        vector = self.clean_decimals(vector)

                    pending.append({
                        'uuid': str(record["uuid"]),
                        'properties': props,
                        'vector': vector
                    })

                    # Flush a full batch.
                    if len(pending) >= batch_size:
                        self._import_batch(target, pending)
                        total += len(pending)
                        logger.info(f"Imporditud: {total} objekti...")
                        pending = []

                except Exception as e:
                    logger.warning(f"Import error: {e}")

            # Flush whatever is left over.
            if pending:
                self._import_batch(target, pending)
                total += len(pending)

        logger.info(f"Import lõpetatud: {total} objekti")
        return total


    def _import_batch(self, collection, batch):
        """Insert one batch of objects; duplicate-UUID errors are ignored."""
        for entry in batch:
            try:
                collection.data.insert(
                    properties=entry['properties'],
                    uuid=entry['uuid'],
                    vector=entry.get('vector')
                )
            except Exception as e:
                if "already exists" not in str(e):
                    logger.warning(f"Insert error: {e}")


    def close_clients(self):
        """Close whichever clients were configured."""
        if self.src_client:
            self.src_client.close()
            logger.info("Source client suletud")
        if self.dst_client:
            self.dst_client.close()
            logger.info("Destination client suletud")

+ 44 - 0
recreate_schema.py

@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+"""
+Loo Weaviate'i klass uuesti ilma vektoriseerimismoodulita
+"""
+
+import sys
+import os
+import json
+
+# Lisa src kaust Pythoni teele
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+from src.weaviate_client import WeaviateClient
+from src.utils import setup_logging
+
+logger = setup_logging()
+
def recreate_schema():
    """Drop the existing 'ScientificArticle' class and re-create it.

    Relies on WeaviateClient._setup_schema(), which builds the schema
    without a vectorizer module; all failures are logged, not raised.
    """
    try:
        wclient = WeaviateClient()

        # Best effort: the class may not exist yet.
        try:
            wclient.client.schema.delete_class("ScientificArticle")
            logger.info("Klass kustutatud")
        except Exception as e:
            logger.warning(f"Klassi kustutamine ebaõnnestus: {e}")

        # Schema creation normally happens in __init__; invoke it explicitly
        # here since we just deleted the class.
        wclient._setup_schema()

        logger.info("Klass uuesti loodud ilma vektoriseerimismoodulita")

        wclient.close()

    except Exception as e:
        logger.error(f"Viga: {e}")
        import traceback
        logger.error(traceback.format_exc())

if __name__ == "__main__":
    recreate_schema()

+ 2 - 1
requirements.txt

@@ -5,4 +5,5 @@ chromadb
 langchain
 langchain-community
 pypdf
-PyPDF2
+PyPDF2
+jupyter

+ 216 - 0
src/metadata_enhancer.py

@@ -0,0 +1,216 @@
+import re
+import json
+from typing import Dict, List, Optional
+import logging
+from .deepseek_client import DeepSeekClient
+from .config import config
+
+logger = logging.getLogger(__name__)
+
class MetadataEnhancer:
    """Enhance extracted article metadata with the DeepSeek API.

    Used when the heuristic PDF extraction produces implausible titles,
    authors or years; the model is asked to correct or re-extract them.
    """

    def __init__(self):
        self.deepseek_client = DeepSeekClient()

    def enhance_metadata_with_ai(self, text: str, current_metadata: Dict) -> Dict:
        """
        Ask the model to correct/complete existing metadata.

        Args:
            text: Article text (only the first ~4000 characters are sent).
            current_metadata: Metadata extracted so far.

        Returns:
            Cleaned, enhanced metadata; falls back to ``current_metadata``
            when the API call or JSON parsing fails.
        """
        logger.info("Täiustan metainfot DeepSeeki abil...")

        system_prompt = """Sa oled teadusartiklite metainfo spetsialist. 
Sinu ülesanne on tuvastada antud teadusartikli õige pealkiri, autorid, avaldamisaasta,
žurnaal ja DOI.

Tagasta vastus JSON formaadis:
{
  "title": "õige pealkiri",
  "authors": ["autor1", "autor2", ...],
  "year": "avaldamisaasta",
  "journal": "žurnaal/konverentsi nimetus",
  "doi": "DOI identifikaator"
}

Kui mõni väli on tuvastamata, jäta see tühjaks.
Aita valesti tuvastatud väärtusi parandada.
"""

        user_prompt = f"""Tuvasta järgmise teadusartikli metainfo:

CURRENT METADATA:
- Pealkiri: {current_metadata.get('title', 'Teadmata')}
- Autorid: {current_metadata.get('authors', [])}
- Aasta: {current_metadata.get('year', 'Teadmata')}
- Žurnaal: {current_metadata.get('journal', 'Teadmata')}
- DOI: {current_metadata.get('doi', 'Teadmata')}

ARTIKLI TEKST (esimesed 4000 märki):
{text[:4000]}

Palun analüüsi artiklit ja paranda või täienda metainfot. Tagasta VAID JSON.
"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self.deepseek_client.call_api(messages, temperature=0.3)

            if response:
                # The model may wrap the JSON in prose; grab the outermost braces.
                json_match = re.search(r'\{.*\}', response, re.DOTALL)
                if json_match:
                    try:
                        enhanced_data = json.loads(json_match.group(0))

                        # Validate fields, falling back to current values.
                        enhanced_data = self._clean_enhanced_metadata(enhanced_data, current_metadata)

                        logger.info("Metainfo täiustatud AI-ga")
                        return enhanced_data
                    except json.JSONDecodeError as e:
                        logger.error(f"JSON parsimise viga: {e}")
                else:
                    logger.error(f"Ei leidnud JSON-i vastuses: {response[:200]}")
        except Exception as e:
            logger.error(f"Viga AI metainfo täiustamisel: {e}")

        # AI path failed -> return the metadata unchanged.
        return current_metadata

    def _clean_enhanced_metadata(self, enhanced_data: Dict, original_data: Dict) -> Dict:
        """Validate AI-returned fields, falling back to the original values.

        ``None`` values from the model are treated as missing (previously a
        ``None`` title/journal/doi crashed on ``.strip()``).
        """
        cleaned = {}

        # Title: plausible length and not a section heading.
        title = (enhanced_data.get('title') or '').strip()
        if (title and
            len(title) > 10 and len(title) < 500 and
            not any(bad in title.lower() for bad in ['abstract', 'keywords', 'introduction', 'contents'])):
            cleaned['title'] = title
        else:
            cleaned['title'] = original_data.get('title', '')

        # Authors: drop entries that look like emails, affiliations or numbers.
        authors = enhanced_data.get('authors', [])
        if isinstance(authors, list):
            cleaned_authors = []
            for author in authors:
                if isinstance(author, str):
                    author_clean = author.strip()
                    if (len(author_clean) > 2 and len(author_clean) < 100 and
                        not any(char.isdigit() for char in author_clean) and
                        '@' not in author_clean and
                        'university' not in author_clean.lower() and
                        'institute' not in author_clean.lower()):
                        cleaned_authors.append(author_clean)

            if cleaned_authors:
                cleaned['authors'] = cleaned_authors
            else:
                cleaned['authors'] = original_data.get('authors', [])
        else:
            cleaned['authors'] = original_data.get('authors', [])

        # Year: a plausible four-digit publication year.
        year = str(enhanced_data.get('year', '') or '').strip()
        if year.isdigit() and 1900 <= int(year) <= 2025:
            cleaned['year'] = year
        else:
            cleaned['year'] = original_data.get('year', '')

        # Journal: any reasonably short name.
        journal = (enhanced_data.get('journal') or '').strip()
        if journal and len(journal) < 200:
            cleaned['journal'] = journal
        else:
            cleaned['journal'] = original_data.get('journal', '')

        # DOI: must look like a DOI identifier or a doi.org URL.
        doi = (enhanced_data.get('doi') or '').strip()
        if doi and (doi.startswith('10.') or 'doi.org' in doi):
            cleaned['doi'] = doi
        else:
            cleaned['doi'] = original_data.get('doi', '')

        return cleaned

    def extract_metadata_directly(self, text: str) -> Dict:
        """
        Extract metadata straight from the text, ignoring prior metadata.

        Useful when the initially extracted metadata is completely wrong.
        Returns an empty dict on failure.
        """
        logger.info("Otsin metainfot otse tekstist...")

        system_prompt = """Otsi antud teadusartikli tekstist pealkirja, autoreid, 
avaldamisaastat, žurnaali ja DOI-d. Tagasta tulemus JSON formaadis.
"""

        user_prompt = f"""Artikli tekst (esimesed 3000 märki):
{text[:3000]}

Palun otsi metainfot. Tagasta VAID JSON.
"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self.deepseek_client.call_api(messages, temperature=0.3)

            if response:
                json_match = re.search(r'\{.*\}', response, re.DOTALL)
                if json_match:
                    try:
                        metadata = json.loads(json_match.group(0))
                        return self._clean_enhanced_metadata(metadata, {})
                    # Narrowed from a bare except: swallow only parse/shape errors.
                    except (json.JSONDecodeError, TypeError, ValueError, AttributeError):
                        pass
        except Exception as e:
            logger.error(f"Viga otse metainfo eraldamisel: {e}")

        return {}

    def is_metadata_valid(self, metadata: Dict) -> bool:
        """Return True when title, authors and year all look trustworthy."""
        # Title must exist and have a plausible length.
        title = metadata.get('title', '')
        if not title or len(title) < 5 or len(title) > 500:
            return False

        # At least one author is required.
        authors = metadata.get('authors', [])
        if not authors:
            return False

        # Reject authors that look like addresses, emails or affiliations.
        for author in authors:
            if (len(author) > 100 or 
                any(char.isdigit() for char in author) or
                '@' in author or
                'university' in author.lower() or
                'institute' in author.lower()):
                return False

        # Year must be a plausible publication year.
        year = str(metadata.get('year', ''))
        if not year.isdigit() or not (1900 <= int(year) <= 2025):
            return False

        return True

+ 132 - 31
src/pdf_processor.py

@@ -137,7 +137,7 @@ class PDFProcessor:
         
         return full_text, sections
     
-    def extract_structured_metadata(self, text: str) -> Dict:
+    def extract_structured_metadata(self, filepath: str, text: str) -> Dict:
         """Proovi eraldada struktureeritud metainfo tekstist"""
         metadata = {
             'title': '',
@@ -148,53 +148,149 @@ class PDFProcessor:
             'keywords': []
         }
         
-        # Otsi pealkirja (esimene suurem rida)
+        # Proovi kõigepealt PDF sisemisest metadata-st (kui on)
+        if filepath:
+            try:
+                with open(filepath, 'rb') as file:
+                    pdf_reader = PyPDF2.PdfReader(file)
+                    pdf_meta = pdf_reader.metadata
+                    
+                    if pdf_meta:
+                        if pdf_meta.get('/Title') and pdf_meta['/Title'].strip():
+                            metadata['title'] = pdf_meta['/Title'].strip()
+                        if pdf_meta.get('/Author') and pdf_meta['/Author'].strip():
+                            authors = pdf_meta['/Author'].split(';')
+                            metadata['authors'] = [a.strip() for a in authors]
+            except Exception:
+                self.logger.warning(f"PDF metadata lugemine ebaõnnestus: {filepath}")
+        
+        # Otsi pealkirja (esimene suurem rida, mis ei ole liiga lühike ega pikk)
         lines = text.split('\n')
-        for line in lines:
+        
+        # Eemalda tühjad read
+        lines = [line.strip() for line in lines if line.strip()]
+        
+        for i, line in enumerate(lines):
             line = line.strip()
-            if len(line) > 20 and len(line) < 200 and not line.startswith('http'):
-                if not metadata['title'] and line[0].isupper():
-                    metadata['title'] = line
+            # Heuristika pealkirja tuvastamiseks
+            if (len(line) > 15 and len(line) < 200 and 
+                not line.startswith('http') and
+                not line.startswith('DOI:') and
+                not line.startswith('doi:') and
+                not 'abstract' in line.lower() and
+                not 'keyword' in line.lower() and
+                not 'introduction' in line.lower() and
+                not '©' in line and
+                not 'corresponding author' in line.lower() and
+                not '@' in line and
+                not line[0].isdigit() and  # Ei alga numbriga
+                not re.match(r'^[\d\s]+$', line) and  # Ei ole ainult numbrid ja tühikud
+                not re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$', line) and  # Ei ole autorinimi
+                not re.match(r'^\w+@\w+\.\w+$', line) and  # Ei ole email
+                not re.match(r'^tel(\.|ephone)?:\s*\+?[\d\s\-]+$', line, re.IGNORECASE) and  # Ei ole telefon
+                not re.match(r'^fax:\s*\+?[\d\s\-]+$', line, re.IGNORECASE) and  # Ei ole fax
+                not re.match(r'^\d{4}\s*$', line) and  # Ei ole aastaarv
+                not metadata['title']):  # Kui pole veel pealkirja
+                
+                metadata['title'] = line
                 break
         
-        # Otsi autoreid (tüüpiline muster)
-        for i, line in enumerate(lines):
-            if 'author' in line.lower() or 'authors' in line.lower():
-                # Proovi järgmised 3 rida
-                for j in range(1, 4):
-                    if i + j < len(lines):
-                        author_line = lines[i + j].strip()
-                        if author_line and len(author_line) < 300:
-                            # Eralda nimed komade või 'and' järgi
-                            authors = re.split(r',|\band\b|;', author_line)
-                            metadata['authors'] = [a.strip() for a in authors if a.strip()]
-                            break
+        # Otsi autoreid (tüüpilised mustrid)
+        author_patterns = [
+            r'^\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)+(?:\s*,\s*and\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)*)\s*$',
+            r'^\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s*&\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)+)\s*$',
+            r'^\s*([A-Z]\.\s+[A-Z][a-z]+(?:\s*,\s*[A-Z]\.\s+[A-Z][a-z]+)*)\s*$'
+        ]
+        
+        # Otsi autorrid pealkirja ja abstrakti vahelt
+        for i in range(min(20, len(lines))):  # Vaata esimesi 20 rida
+            line = lines[i]
+            
+            # Kontrolli autorimustreid
+            for pattern in author_patterns:
+                match = re.match(pattern, line)
+                if match and len(line) < 200:  # Liiga pikk ei saa olla autor
+                    # Eralda nimed
+                    if '&' in line:
+                        authors = [a.strip() for a in line.split('&')]
+                    elif ' and ' in line:
+                        parts = line.split(' and ')
+                        authors = []
+                        for part in parts:
+                            if ',' in part:
+                                authors.extend([a.strip() for a in part.split(',')])
+                            else:
+                                authors.append(part.strip())
+                    elif ',' in line:
+                        authors = [a.strip() for a in line.split(',')]
+                    else:
+                        authors = [line.strip()]
+                    
+                    # Filtreeri ebareaalsed autorid
+                    authors = [a for a in authors if len(a) > 3 and len(a) < 50 and not any(char.isdigit() for char in a)]
+                    if authors:
+                        metadata['authors'] = authors
+                        break
         
         # Otsi aastat
-        year_pattern = r'\((\d{4})\)|(\d{4})\s*[A-Z]'
-        for line in lines:
-            match = re.search(year_pattern, line)
-            if match:
-                metadata['year'] = match.group(1) or match.group(2)
+        year_patterns = [
+            r'\((\d{4})\)',
+            r'\b(\d{4})\b',
+            r'©\s*(\d{4})',
+            r'\b(19\d{2}|20\d{2})\b'
+        ]
+        
+        for line in lines[:50]:  # Vaata esimesi 50 rida
+            for pattern in year_patterns:
+                matches = re.findall(pattern, line)
+                for match in matches:
+                    if match.isdigit() and 1900 <= int(match) <= 2025:
+                        metadata['year'] = match
+                        break
+            if metadata['year']:
                 break
         
         # Otsi DOI
-        doi_pattern = r'doi:\s*([^\s]+|10\.\d{4,9}/[-._;()/:A-Z0-9]+)'
+        doi_patterns = [
+            r'doi:\s*([^\s]+)',
+            r'DOI:\s*([^\s]+)',
+            r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
+        ]
+        
         for line in lines:
-            match = re.search(doi_pattern, line, re.IGNORECASE)
-            if match:
-                metadata['doi'] = match.group(1)
+            for pattern in doi_patterns:
+                match = re.search(pattern, line, re.IGNORECASE)
+                if match:
+                    doi_text = match.group(1) if 'doi:' not in pattern.lower() else match.group(1).lstrip('doi:').strip()
+                    metadata['doi'] = doi_text
+                    break
+            if metadata['doi']:
+                break
+        
+        # Otsi žurnaali nime
+        journal_indicators = ['Journal of', 'Transportation', 'Transport', 'Geography', 'Science', 'Research', 'Review']
+        for i, line in enumerate(lines[:30]):
+            line_lower = line.lower()
+            if any(indicator in line for indicator in journal_indicators) and len(line) < 100:
+                # Vaata, kas järgnevad read on seotud
+                if i + 1 < len(lines) and lines[i + 1]:
+                    if len(lines[i + 1]) < 100 and not any(char.isdigit() for char in lines[i + 1]):
+                        metadata['journal'] = f"{line} {lines[i + 1]}".strip()
+                else:
+                    metadata['journal'] = line.strip()
                 break
         
         # Otsi võtmesõnu
-        for line in lines:
+        for i, line in enumerate(lines):
             if 'keyword' in line.lower():
                 # Proovi järgmised read
                 for j in range(1, 3):
                     if i + j < len(lines):
                         kw_line = lines[i + j].strip()
                         if kw_line:
-                            metadata['keywords'] = [k.strip() for k in re.split(r',|;', kw_line)]
+                            # Eralda komade või semikoolonitega
+                            keywords = re.split(r',|;', kw_line)
+                            metadata['keywords'] = [k.strip() for k in keywords if k.strip()]
                             break
         
         return metadata
@@ -209,19 +305,24 @@ class PDFProcessor:
             full_text, sections = self.extract_text_from_pdf(filepath)
             
             # Eralda struktureeritud metainfo
-            structured_meta = self.extract_structured_metadata(full_text)
+            structured_meta = self.extract_structured_metadata(filepath,full_text)
             
             # Ühenda kõik andmed
             result = {
                 'pdf_metadata': asdict(metadata),
                 'structured_metadata': structured_meta,
-                'full_text': full_text[:5000],  # Säästa mälu, salvesta ainult algus
+                'full_text': full_text[:8000],  # Säästa mälu, salvesta ainult algus (suurendatud)
                 'sections': [asdict(s) for s in sections],
                 'processing_date': datetime.now().isoformat(),
                 'word_count': len(full_text.split())
             }
             
             self.logger.info(f"PDF töödeldud: {metadata.filename}")
+            self.logger.info(f"  Pealkiri: {structured_meta.get('title', 'Teadmata')}")
+            self.logger.info(f"  Autorid: {structured_meta.get('authors', [])}")
+            self.logger.info(f"  Aasta: {structured_meta.get('year', 'Teadmata')}")
+            self.logger.info(f"  DOI: {structured_meta.get('doi', 'Teadmata')}")
+            
             return result
             
         except Exception as e:

+ 62 - 22
src/pipeline.py

@@ -10,6 +10,7 @@ from .pdf_processor import PDFProcessor
 from .deepseek_client import DeepSeekClient
 from .embedding_generator import EmbeddingGenerator
 from .weaviate_client import WeaviateClient
+from .metadata_enhancer import MetadataEnhancer
 from .config import config
 from .utils import setup_logging, save_processed_article
 
@@ -23,12 +24,14 @@ class ArticleProcessingPipeline:
         self.deepseek_client = DeepSeekClient()
         self.embedding_generator = EmbeddingGenerator()
         self.weaviate_client = WeaviateClient()
+        self.metadata_enhancer = MetadataEnhancer()
         
         self.stats = {
             'processed': 0,
             'saved': 0,
             'skipped': 0,
-            'errors': 0
+            'errors': 0,
+            'metadata_enhanced': 0
         }
     
     def process_single_article(self, pdf_path: str) -> Optional[Dict]:
@@ -39,51 +42,79 @@ class ArticleProcessingPipeline:
             # 1. PDF töötlus
             pdf_data = self.pdf_processor.process_pdf(pdf_path)
             
-            # 2. Eralda abstrakt (kui leiad)
+            # 2. Täiusta metainfot AI-ga
+            structured_metadata = pdf_data['structured_metadata']
+            
+            # Kontrolli, kas algne metadata on usaldusväärne
+            if not self.metadata_enhancer.is_metadata_valid(structured_metadata):
+                logger.info(f"Algne metadata ebausaldusväärne, täiustan AI-ga")
+                
+                # Proovi kõigepealt täiustada olemasolevat
+                enhanced_metadata = self.metadata_enhancer.enhance_metadata_with_ai(
+                    text=pdf_data['full_text'],
+                    current_metadata=structured_metadata
+                )
+                
+                # Kui ikka ei ole usaldusväärne, otsi otse tekstist
+                if not self.metadata_enhancer.is_metadata_valid(enhanced_metadata):
+                    logger.info(f"AI täiustamine ebaõnnestus, otsin otse tekstist")
+                    enhanced_metadata = self.metadata_enhancer.extract_metadata_directly(
+                        pdf_data['full_text']
+                    )
+                    self.stats['metadata_enhanced'] += 1
+                
+                # Kasuta täiustatud metadata't
+                if enhanced_metadata:
+                    # Säilita väärtused, mida AI ei täiustanud
+                    for key in ['title', 'authors', 'year', 'journal', 'doi']:
+                        if key in enhanced_metadata and enhanced_metadata[key]:
+                            structured_metadata[key] = enhanced_metadata[key]
+            
+            # 3. Eralda abstrakt (kui leiad)
             abstract_en = ""
             for section in pdf_data['sections']:
                 if section['section_type'] == 'abstract':
                     abstract_en = section['content']
                     break
             
-            # 3. DeepSeek analüüs
+            # 4. DeepSeek analüüs
             summary_data = self.deepseek_client.create_summary(
                 text=pdf_data['full_text'],
                 context={
-                    'title': pdf_data['structured_metadata'].get('title', ''),
-                    'authors': pdf_data['structured_metadata'].get('authors', []),
-                    'year': pdf_data['structured_metadata'].get('year', ''),
-                    'journal': pdf_data['structured_metadata'].get('journal', '')
+                    'title': structured_metadata.get('title', ''),
+                    'authors': structured_metadata.get('authors', []),
+                    'year': structured_metadata.get('year', ''),
+                    'journal': structured_metadata.get('journal', '')
                 }
             )
             
-            # 4. Eralda võtmesõnad
+            # 5. Eralda võtmesõnad
             key_concepts = self.deepseek_client.extract_key_concepts(
                 text=pdf_data['full_text'],
                 summary=summary_data['summary_et']
             )
             
-            # 5. Tuvasta meetodid
+            # 6. Tuvasta meetodid
             methods_used = self.deepseek_client.identify_methods(pdf_data['full_text'])
             
-            # 6. Analüüsi transpordi kontekst
+            # 7. Analüüsi transpordi kontekst
             transport_context = self.deepseek_client.analyze_transport_context(
                 summary_data['summary_et']
             )
             
-            # 7. Koosta lõplik artikkel
+            # 8. Koosta lõplik artikkel
             article_data = {
                 # PDF metainfo
                 'pdf_metadata': pdf_data['pdf_metadata'],
                 'source_file': pdf_path,
                 'file_hash': pdf_data['pdf_metadata']['file_hash'],
                 
-                # Struktureeritud metainfo
-                'title': pdf_data['structured_metadata'].get('title', ''),
-                'authors': pdf_data['structured_metadata'].get('authors', []),
-                'year': pdf_data['structured_metadata'].get('year', ''),
-                'journal': pdf_data['structured_metadata'].get('journal', ''),
-                'doi': pdf_data['structured_metadata'].get('doi', ''),
+                # Struktureeritud metainfo (AI-ga täiustatud)
+                'title': structured_metadata.get('title', ''),
+                'authors': structured_metadata.get('authors', []),
+                'year': structured_metadata.get('year', ''),
+                'journal': structured_metadata.get('journal', ''),
+                'doi': structured_metadata.get('doi', ''),
                 
                 # Tekstisisu
                 'abstract_en': abstract_en,
@@ -101,13 +132,18 @@ class ArticleProcessingPipeline:
                 'section_count': len(pdf_data['sections'])
             }
             
-            # 8. Genereeri embeddingud
+            # Logi lõplik metadata
+            logger.info(f"Lõplik metadata: {article_data['title'][:50]}...")
+            logger.info(f"  Autorid: {article_data['authors']}")
+            logger.info(f"  Aasta: {article_data['year']}")
+            
+            # 9. Genereeri embeddingud
             embeddings = self.embedding_generator.generate_article_embeddings(article_data)
             
-            # 9. Salvesta Weaviate'i
+            # 10. Salvesta Weaviate'i
             saved = self.weaviate_client.save_article(article_data, embeddings)
             
-            # 10. Salvesta töödeldud andmed faili
+            # 11. Salvesta töödeldud andmed faili
             processed_data = {
                 'article': article_data,
                 'embeddings_summary': embeddings.get('summary', []),
@@ -115,7 +151,8 @@ class ArticleProcessingPipeline:
                     'pdf_pages': pdf_data['pdf_metadata']['page_count'],
                     'summary_length': summary_data['summary_length'],
                     'concepts_count': len(key_concepts),
-                    'methods_count': len(methods_used)
+                    'methods_count': len(methods_used),
+                    'metadata_enhanced': self.stats['metadata_enhanced']
                 }
             }
             
@@ -135,6 +172,8 @@ class ArticleProcessingPipeline:
         except Exception as e:
             self.stats['errors'] += 1
             logger.error(f"Viga artikli töötlemisel {pdf_path}: {str(e)}")
+            import traceback
+            logger.error(traceback.format_exc())
             return None
     
     def process_batch(self, pdf_files: List[str]) -> List[Dict]:
@@ -147,7 +186,7 @@ class ArticleProcessingPipeline:
                 results.append(result)
             
             # Väike paus, et mitte üle koormata API-d
-            time.sleep(1)
+            time.sleep(1.5)
         
         return results
     
@@ -192,6 +231,7 @@ class ArticleProcessingPipeline:
         logger.info(f"Salvestatud artikleid: {self.stats['saved']}")
         logger.info(f"Vahele jäetud (duplikaadid): {self.stats['skipped']}")
         logger.info(f"Vigaseid töötlusi: {self.stats['errors']}")
+        logger.info(f"AI-ga täiustatud metainfosid: {self.stats['metadata_enhanced']}")
         logger.info("="*50)
     
     def close(self):

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.