Quellcode durchsuchen

Vahetulemus 5-st 3 artikli import korras

Ardo Kubjas vor 4 Monaten
Ursprung
Commit
ca37369b2b

+ 3 - 0
.gitignore

@@ -61,4 +61,7 @@ target/
 # Ardo
 data/logs
 data/pdfs
+data/processed
+tmp/
+.venv/
 ..env.swp

+ 9 - 0
LOEMIND.md

@@ -0,0 +1,9 @@
+
+```bash
+sudo apt install python3-ipykernel
+cd ~/rag-demo
+python3 -m venv .venv
+source .venv/bin/activate
+pip install -r requirements.txt
+pip install -r transpordi_artiklid/requirements.txt
+```

+ 96 - 0
LOEMIND_GIT.md

@@ -0,0 +1,96 @@
+# Git Repo Kasutusjuhend
+
+## Põhitoimingud
+
+### Algus
+```bash
+# Klooni repo (kui pole veel tehtud)
+git clone https://gogs.odamus.com/weaviate/transpordi_artiklid.git
+cd transpordi_artiklid
+```
+
+### Igapäevased toimingud
+
+1. **Enne tööd alustamist uuenda kohalikku haru:**
+```bash
+git pull origin master
+```
+
+2. **Vaata muudatusi:**
+```bash
+git status
+git diff
+```
+
+3. **Lisa muudatused:**
+```bash
+# Lisa konkreetne fail
+git add <faili_nimi>
+
+# Lisa kõik muudatused
+git add .
+
+# Lisa osaliselt (interaktiivne)
+git add -p
+```
+
+4. **Loo commit:**
+```bash
+git commit -m "Selgitav sõnum"
+```
+
+5. **Saada muudatused serverisse:**
+```bash
+git push origin master
+```
+
+### Harude (branches) kasutamine
+
+```bash
+# Uue haru loomine
+git checkout -b uus-haru
+
+# Haru vahetamine
+git checkout haru-nimi
+
+# Haru ühendamine masterisse
+git checkout master
+git merge uus-haru
+git push origin master
+```
+
+## Olulised nõuanded
+
+### 1. **Enne push'i alati pull**
+Alati tee `git pull` enne kui hakkad tööle, et vältida konflikte.
+
+### 2. **Kommenteeri hästi**
+Commit sõnumid peaksid olema selged ja kirjeldavad.
+
+### 3. **Ära push'i suuri faile**
+Git pole mõeldud piltide, videote või muude suurte failide jaoks. Kasuta .gitignore faili.
+
+### 4. **Tööta harudes**
+Suurte muudatuste tegemisel loo uus haru, et mitte segada peaharuga.
+
+### 5. **Salvesta tihti**
+Tee commite tihti - parem palju väikeseid commite kui üks suur.
+
+## Abikäsud
+
+```bash
+# Ajalugu vaatamine
+git log --oneline
+
+# Muudatuste tagasivõtmine
+git checkout -- <fail>
+
+# Viimase commit'i muutmine
+git commit --amend
+```
+
+## Seadistus
+Sinu repo on seadistatud jälgima kaugrepot `origin` aadressil:
+`https://gogs.odamus.com/weaviate/transpordi_artiklid.git`
+
+Peaharu (`master`) on seotud kaugrepo master haruga.

+ 1 - 1
README.md

@@ -14,6 +14,6 @@ Süsteem teadusartiklite automaatseks töötlemiseks, analüüsiks ja salvestami
 
 1. Klooni repository:
 ```bash
-git clone [repository-url]
+git clone https://gogs.odamus.com/weaviate/transpordi_artiklid.git
 cd transpordi_artiklid
 ```

Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 19
data/processed/article_20251229_093529.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 22
data/processed/article_20251229_093733.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 22
data/processed/article_20251229_093944.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 21
data/processed/article_20251229_094153.json


Datei-Diff unterdrückt, da er zu groß ist
+ 0 - 25
data/processed/article_20251229_094407.json


Datei-Diff unterdrückt, da er zu groß ist
+ 83 - 0
jupyter/WeaviateExportImport.ipynb


+ 219 - 0
jupyter/weaviate_export_import_clean.py

@@ -0,0 +1,219 @@
+"""
+Weaviate Collection Export/Import Utility
+
+Korduvkasutatav tööriist Weaviate kollektsioonide eksportimiseks ja importimiseks JSON backup failide kaudu.
+Toetab UUID normaliseerimist, int/float tüüpe, doc_hash, vigu ja batch operatsioone.
+"""
+import datetime
+import json
+import uuid
+import logging
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Union
+from weaviate import WeaviateClient, ConnectionParams
+from weaviate.classes.config import Property, DataType
+from decimal import Decimal
+import ijson
+
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
class WeaviateExportImport:
    """Reusable helper for exporting and importing Weaviate collections.

    Objects are streamed to and from JSON backup files; UUIDs, integer
    fields and doc_hash values are normalised along the way, and imports
    run in batches.
    """

    def __init__(self, src_client: Optional[WeaviateClient] = None, dst_client: Optional[WeaviateClient] = None):
        # src_client is needed for exports, dst_client for imports.
        self.src_client = src_client
        self.dst_client = dst_client

    @staticmethod
    def create_client(host: str, http_port: int = 9020, grpc_port: int = 50051, secure: bool = False) -> WeaviateClient:
        """Create, connect and return a WeaviateClient for the given host/ports."""
        connection = ConnectionParams.from_params(
            http_host=host,
            http_port=http_port,
            http_secure=secure,
            grpc_host=host,
            grpc_port=grpc_port,
            grpc_secure=secure,
        )
        client = WeaviateClient(connection_params=connection)
        client.connect()
        logger.info(f"Ühendatud Weaviate'ga: {host}:{http_port}")
        return client

    def normalize_int_fields(self, props: Dict[str, Any], int_fields: List[str] = None) -> Dict[str, Any]:
        """Coerce float/Decimal values of the named fields back to int, in place."""
        fields = ["page_start", "page_end", "chunk"] if int_fields is None else int_fields
        for name in fields:
            if name not in props:
                continue
            current = props[name]
            if isinstance(current, float) and current.is_integer():
                props[name] = int(current)
            elif isinstance(current, Decimal):
                # Whole-number Decimals become int, fractional ones float.
                props[name] = int(current) if current % 1 == 0 else float(current)
        return props

    def normalize_doc_hash(self, doc_hash: Any) -> str:
        """Return doc_hash as a dash-free string (UUIDs become bare hex)."""
        if isinstance(doc_hash, uuid.UUID):
            return doc_hash.hex
        if isinstance(doc_hash, str) and "-" in doc_hash and len(doc_hash) == 36:
            return doc_hash.replace("-", "")
        return str(doc_hash)

    def clean_uuid(self, obj: Any) -> Any:
        """Recursively replace UUID(-like) values with plain strings."""
        if isinstance(obj, dict):
            return {key: self.clean_uuid(value) for key, value in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [self.clean_uuid(entry) for entry in obj]
        if isinstance(obj, uuid.UUID):
            return str(obj)
        # Also catches third-party UUID wrapper types by class name.
        if hasattr(obj, "__str__") and obj.__class__.__name__.lower().startswith("uuid"):
            return str(obj)
        return obj

    def process_properties(self, props: Dict[str, Any], int_fields: List[str] = None, hash_fields: List[str] = None) -> Dict[str, Any]:
        """Run all property normalisations: UUIDs, integer fields, hash fields."""
        targets = ["doc_hash"] if hash_fields is None else hash_fields
        props = self.clean_uuid(props)
        props = self.normalize_int_fields(props, int_fields)
        for name in targets:
            if name in props:
                props[name] = self.normalize_doc_hash(props[name])
        return props

    def export_collection(self, collection_name: str, output_file: Union[str, Path], 
                                    int_fields: List[str] = None, hash_fields: List[str] = None, 
                                    include_vectors: bool = True) -> int:
        """Stream every object of a collection into a JSON array file.

        Returns the number of exported objects. Raises ValueError when no
        source client is configured.
        """
        if not self.src_client:
            raise ValueError("Source client pole määratud")

        logger.info(f"Alustan kollektsiooni '{collection_name}' streaming eksporti...")
        source = self.src_client.collections.get(collection_name)
        target_path = Path(output_file)
        total = 0

        def encode_fallback(value):
            # json.dump fallback: ISO strings for datetimes, str() for the rest.
            if isinstance(value, datetime.datetime):
                return value.isoformat()
            return str(value)

        # Write objects straight to disk rather than collecting them in memory.
        with open(target_path, "w", encoding="utf-8") as out:
            out.write("[\n")  # open the JSON array
            wrote_any = False

            for item in source.iterator(include_vector=include_vectors):
                record = {
                    'uuid': str(item.uuid),
                    'properties': self.process_properties(dict(item.properties),
                                                          int_fields=int_fields,
                                                          hash_fields=hash_fields),
                }
                if include_vectors:
                    record['vector'] = item.vector

                # Comma-separate all but the first element.
                if wrote_any:
                    out.write(",\n")
                json.dump(record, out, ensure_ascii=False, default=encode_fallback)
                wrote_any = True
                total += 1

                # Progress message every 1000 objects.
                if total % 1000 == 0:
                    logger.info(f"Eksporditud: {total} objekti...")

            out.write("\n]")  # close the JSON array

        logger.info(f"Eksport valmis: {total} objekti")
        return total

    def clean_decimals(self, obj: Any) -> Any:
        """Convert every Decimal in a nested structure to float."""
        if isinstance(obj, Decimal):
            return float(obj)
        if isinstance(obj, dict):
            return {key: self.clean_decimals(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [self.clean_decimals(entry) for entry in obj]
        return obj

    def import_collection(self, collection_name: str, input_file: Union[str, Path],
                        int_fields: List[str] = None, batch_size: int = 100,
                        recreate_collection: bool = False) -> int:
        """Stream a JSON backup file into a collection, batch by batch.

        Returns the number of imported objects. Raises ValueError when no
        destination client is configured.
        NOTE(review): recreate_collection is accepted but currently unused —
        confirm whether schema recreation should happen here.
        """
        if not self.dst_client:
            raise ValueError("Destination client pole määratud")

        logger.info(f"Alustan kollektsiooni '{collection_name}' streaming importi...")
        target = self.dst_client.collections.get(collection_name)
        source_path = Path(input_file)

        total = 0
        pending = []

        # ijson yields one array element at a time instead of loading the file
        # whole; use_float=True avoids Decimal values in the parsed objects.
        with open(source_path, 'rb') as handle:
            for record in ijson.items(handle, 'item', use_float=True):
                try:
                    props = self.clean_decimals(record["properties"])
                    if int_fields:
                        props = self.normalize_int_fields(props, int_fields)

                    # Vectors can contain Decimals too.
                    vector = record.get("vector")
                    if vector is not None:
                        vector = self.clean_decimals(vector)

                    pending.append({
                        'uuid': str(record["uuid"]),
                        'properties': props,
                        'vector': vector
                    })

                    # Flush a full batch.
                    if len(pending) >= batch_size:
                        self._import_batch(target, pending)
                        total += len(pending)
                        logger.info(f"Imporditud: {total} objekti...")
                        pending = []

                except Exception as e:
                    logger.warning(f"Import error: {e}")

            # Flush whatever is left over.
            if pending:
                self._import_batch(target, pending)
                total += len(pending)

        logger.info(f"Import lõpetatud: {total} objekti")
        return total


    def _import_batch(self, collection, batch):
        """Insert one batch of objects; duplicate-UUID errors are ignored."""
        for entry in batch:
            try:
                collection.data.insert(
                    properties=entry['properties'],
                    uuid=entry['uuid'],
                    vector=entry.get('vector')
                )
            except Exception as e:
                if "already exists" not in str(e):
                    logger.warning(f"Insert error: {e}")


    def close_clients(self):
        """Close whichever clients were configured."""
        if self.src_client:
            self.src_client.close()
            logger.info("Source client suletud")
        if self.dst_client:
            self.dst_client.close()
            logger.info("Destination client suletud")

+ 44 - 0
recreate_schema.py

@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+"""
+Loo Weaviate'i klass uuesti ilma vektoriseerimismoodulita
+"""
+
+import sys
+import os
+import json
+
+# Lisa src kaust Pythoni teele
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+from src.weaviate_client import WeaviateClient
+from src.utils import setup_logging
+
+logger = setup_logging()
+
def recreate_schema():
    """Drop the existing 'ScientificArticle' class and re-create it.

    Relies on WeaviateClient._setup_schema(), which builds the schema
    without a vectorizer module; all failures are logged, not raised.
    """
    try:
        wclient = WeaviateClient()

        # Best effort: the class may not exist yet.
        try:
            wclient.client.schema.delete_class("ScientificArticle")
            logger.info("Klass kustutatud")
        except Exception as e:
            logger.warning(f"Klassi kustutamine ebaõnnestus: {e}")

        # Schema creation normally happens in __init__; invoke it explicitly
        # here since we just deleted the class.
        wclient._setup_schema()

        logger.info("Klass uuesti loodud ilma vektoriseerimismoodulita")

        wclient.close()

    except Exception as e:
        logger.error(f"Viga: {e}")
        import traceback
        logger.error(traceback.format_exc())

if __name__ == "__main__":
    recreate_schema()

+ 2 - 1
requirements.txt

@@ -5,4 +5,5 @@ chromadb
 langchain
 langchain-community
 pypdf
-PyPDF2
+PyPDF2
+jupyter

+ 216 - 0
src/metadata_enhancer.py

@@ -0,0 +1,216 @@
+import re
+import json
+from typing import Dict, List, Optional
+import logging
+from .deepseek_client import DeepSeekClient
+from .config import config
+
+logger = logging.getLogger(__name__)
+
class MetadataEnhancer:
    """Enhance extracted article metadata with the DeepSeek API.

    Used when the heuristic PDF extraction produces implausible titles,
    authors or years; the model is asked to correct or re-extract them.
    """

    def __init__(self):
        self.deepseek_client = DeepSeekClient()

    def enhance_metadata_with_ai(self, text: str, current_metadata: Dict) -> Dict:
        """
        Ask the model to correct/complete existing metadata.

        Args:
            text: Article text (only the first ~4000 characters are sent).
            current_metadata: Metadata extracted so far.

        Returns:
            Cleaned, enhanced metadata; falls back to ``current_metadata``
            when the API call or JSON parsing fails.
        """
        logger.info("Täiustan metainfot DeepSeeki abil...")

        system_prompt = """Sa oled teadusartiklite metainfo spetsialist. 
Sinu ülesanne on tuvastada antud teadusartikli õige pealkiri, autorid, avaldamisaasta,
žurnaal ja DOI.

Tagasta vastus JSON formaadis:
{
  "title": "õige pealkiri",
  "authors": ["autor1", "autor2", ...],
  "year": "avaldamisaasta",
  "journal": "žurnaal/konverentsi nimetus",
  "doi": "DOI identifikaator"
}

Kui mõni väli on tuvastamata, jäta see tühjaks.
Aita valesti tuvastatud väärtusi parandada.
"""

        user_prompt = f"""Tuvasta järgmise teadusartikli metainfo:

CURRENT METADATA:
- Pealkiri: {current_metadata.get('title', 'Teadmata')}
- Autorid: {current_metadata.get('authors', [])}
- Aasta: {current_metadata.get('year', 'Teadmata')}
- Žurnaal: {current_metadata.get('journal', 'Teadmata')}
- DOI: {current_metadata.get('doi', 'Teadmata')}

ARTIKLI TEKST (esimesed 4000 märki):
{text[:4000]}

Palun analüüsi artiklit ja paranda või täienda metainfot. Tagasta VAID JSON.
"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self.deepseek_client.call_api(messages, temperature=0.3)

            if response:
                # The model may wrap the JSON in prose; grab the outermost braces.
                json_match = re.search(r'\{.*\}', response, re.DOTALL)
                if json_match:
                    try:
                        enhanced_data = json.loads(json_match.group(0))

                        # Validate fields, falling back to current values.
                        enhanced_data = self._clean_enhanced_metadata(enhanced_data, current_metadata)

                        logger.info("Metainfo täiustatud AI-ga")
                        return enhanced_data
                    except json.JSONDecodeError as e:
                        logger.error(f"JSON parsimise viga: {e}")
                else:
                    logger.error(f"Ei leidnud JSON-i vastuses: {response[:200]}")
        except Exception as e:
            logger.error(f"Viga AI metainfo täiustamisel: {e}")

        # AI path failed -> return the metadata unchanged.
        return current_metadata

    def _clean_enhanced_metadata(self, enhanced_data: Dict, original_data: Dict) -> Dict:
        """Validate AI-returned fields, falling back to the original values.

        ``None`` values from the model are treated as missing (previously a
        ``None`` title/journal/doi crashed on ``.strip()``).
        """
        cleaned = {}

        # Title: plausible length and not a section heading.
        title = (enhanced_data.get('title') or '').strip()
        if (title and
            len(title) > 10 and len(title) < 500 and
            not any(bad in title.lower() for bad in ['abstract', 'keywords', 'introduction', 'contents'])):
            cleaned['title'] = title
        else:
            cleaned['title'] = original_data.get('title', '')

        # Authors: drop entries that look like emails, affiliations or numbers.
        authors = enhanced_data.get('authors', [])
        if isinstance(authors, list):
            cleaned_authors = []
            for author in authors:
                if isinstance(author, str):
                    author_clean = author.strip()
                    if (len(author_clean) > 2 and len(author_clean) < 100 and
                        not any(char.isdigit() for char in author_clean) and
                        '@' not in author_clean and
                        'university' not in author_clean.lower() and
                        'institute' not in author_clean.lower()):
                        cleaned_authors.append(author_clean)

            if cleaned_authors:
                cleaned['authors'] = cleaned_authors
            else:
                cleaned['authors'] = original_data.get('authors', [])
        else:
            cleaned['authors'] = original_data.get('authors', [])

        # Year: a plausible four-digit publication year.
        year = str(enhanced_data.get('year', '') or '').strip()
        if year.isdigit() and 1900 <= int(year) <= 2025:
            cleaned['year'] = year
        else:
            cleaned['year'] = original_data.get('year', '')

        # Journal: any reasonably short name.
        journal = (enhanced_data.get('journal') or '').strip()
        if journal and len(journal) < 200:
            cleaned['journal'] = journal
        else:
            cleaned['journal'] = original_data.get('journal', '')

        # DOI: must look like a DOI identifier or a doi.org URL.
        doi = (enhanced_data.get('doi') or '').strip()
        if doi and (doi.startswith('10.') or 'doi.org' in doi):
            cleaned['doi'] = doi
        else:
            cleaned['doi'] = original_data.get('doi', '')

        return cleaned

    def extract_metadata_directly(self, text: str) -> Dict:
        """
        Extract metadata straight from the text, ignoring prior metadata.

        Useful when the initially extracted metadata is completely wrong.
        Returns an empty dict on failure.
        """
        logger.info("Otsin metainfot otse tekstist...")

        system_prompt = """Otsi antud teadusartikli tekstist pealkirja, autoreid, 
avaldamisaastat, žurnaali ja DOI-d. Tagasta tulemus JSON formaadis.
"""

        user_prompt = f"""Artikli tekst (esimesed 3000 märki):
{text[:3000]}

Palun otsi metainfot. Tagasta VAID JSON.
"""

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        try:
            response = self.deepseek_client.call_api(messages, temperature=0.3)

            if response:
                json_match = re.search(r'\{.*\}', response, re.DOTALL)
                if json_match:
                    try:
                        metadata = json.loads(json_match.group(0))
                        return self._clean_enhanced_metadata(metadata, {})
                    # Narrowed from a bare except: swallow only parse/shape errors.
                    except (json.JSONDecodeError, TypeError, ValueError, AttributeError):
                        pass
        except Exception as e:
            logger.error(f"Viga otse metainfo eraldamisel: {e}")

        return {}

    def is_metadata_valid(self, metadata: Dict) -> bool:
        """Return True when title, authors and year all look trustworthy."""
        # Title must exist and have a plausible length.
        title = metadata.get('title', '')
        if not title or len(title) < 5 or len(title) > 500:
            return False

        # At least one author is required.
        authors = metadata.get('authors', [])
        if not authors:
            return False

        # Reject authors that look like addresses, emails or affiliations.
        for author in authors:
            if (len(author) > 100 or 
                any(char.isdigit() for char in author) or
                '@' in author or
                'university' in author.lower() or
                'institute' in author.lower()):
                return False

        # Year must be a plausible publication year.
        year = str(metadata.get('year', ''))
        if not year.isdigit() or not (1900 <= int(year) <= 2025):
            return False

        return True

+ 132 - 31
src/pdf_processor.py

@@ -137,7 +137,7 @@ class PDFProcessor:
         
         return full_text, sections
     
-    def extract_structured_metadata(self, text: str) -> Dict:
+    def extract_structured_metadata(self, filepath: str, text: str) -> Dict:
         """Proovi eraldada struktureeritud metainfo tekstist"""
         metadata = {
             'title': '',
@@ -148,53 +148,149 @@ class PDFProcessor:
             'keywords': []
         }
         
-        # Otsi pealkirja (esimene suurem rida)
+        # Proovi kõigepealt PDF sisemisest metadata-st (kui on)
+        if filepath:
+            try:
+                with open(filepath, 'rb') as file:
+                    pdf_reader = PyPDF2.PdfReader(file)
+                    pdf_meta = pdf_reader.metadata
+                    
+                    if pdf_meta:
+                        if pdf_meta.get('/Title') and pdf_meta['/Title'].strip():
+                            metadata['title'] = pdf_meta['/Title'].strip()
+                        if pdf_meta.get('/Author') and pdf_meta['/Author'].strip():
+                            authors = pdf_meta['/Author'].split(';')
+                            metadata['authors'] = [a.strip() for a in authors]
+            except Exception:
+                self.logger.warning(f"PDF metadata lugemine ebaõnnestus: {filepath}")
+        
+        # Otsi pealkirja (esimene suurem rida, mis ei ole liiga lühike ega pikk)
         lines = text.split('\n')
-        for line in lines:
+        
+        # Eemalda tühjad read
+        lines = [line.strip() for line in lines if line.strip()]
+        
+        for i, line in enumerate(lines):
             line = line.strip()
-            if len(line) > 20 and len(line) < 200 and not line.startswith('http'):
-                if not metadata['title'] and line[0].isupper():
-                    metadata['title'] = line
+            # Heuristika pealkirja tuvastamiseks
+            if (len(line) > 15 and len(line) < 200 and 
+                not line.startswith('http') and
+                not line.startswith('DOI:') and
+                not line.startswith('doi:') and
+                not 'abstract' in line.lower() and
+                not 'keyword' in line.lower() and
+                not 'introduction' in line.lower() and
+                not '©' in line and
+                not 'corresponding author' in line.lower() and
+                not '@' in line and
+                not line[0].isdigit() and  # Ei alga numbriga
+                not re.match(r'^[\d\s]+$', line) and  # Ei ole ainult numbrid ja tühikud
+                not re.match(r'^[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*$', line) and  # Ei ole autorinimi
+                not re.match(r'^\w+@\w+\.\w+$', line) and  # Ei ole email
+                not re.match(r'^tel(\.|ephone)?:\s*\+?[\d\s\-]+$', line, re.IGNORECASE) and  # Ei ole telefon
+                not re.match(r'^fax:\s*\+?[\d\s\-]+$', line, re.IGNORECASE) and  # Ei ole fax
+                not re.match(r'^\d{4}\s*$', line) and  # Ei ole aastaarv
+                not metadata['title']):  # Kui pole veel pealkirja
+                
+                metadata['title'] = line
                 break
         
-        # Otsi autoreid (tüüpiline muster)
-        for i, line in enumerate(lines):
-            if 'author' in line.lower() or 'authors' in line.lower():
-                # Proovi järgmised 3 rida
-                for j in range(1, 4):
-                    if i + j < len(lines):
-                        author_line = lines[i + j].strip()
-                        if author_line and len(author_line) < 300:
-                            # Eralda nimed komade või 'and' järgi
-                            authors = re.split(r',|\band\b|;', author_line)
-                            metadata['authors'] = [a.strip() for a in authors if a.strip()]
-                            break
+        # Otsi autoreid (tüüpilised mustrid)
+        author_patterns = [
+            r'^\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s*,\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)+(?:\s*,\s*and\s+[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)*)\s*$',
+            r'^\s*([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*(?:\s*&\s*[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)+)\s*$',
+            r'^\s*([A-Z]\.\s+[A-Z][a-z]+(?:\s*,\s*[A-Z]\.\s+[A-Z][a-z]+)*)\s*$'
+        ]
+        
+        # Otsi autorrid pealkirja ja abstrakti vahelt
+        for i in range(min(20, len(lines))):  # Vaata esimesi 20 rida
+            line = lines[i]
+            
+            # Kontrolli autorimustreid
+            for pattern in author_patterns:
+                match = re.match(pattern, line)
+                if match and len(line) < 200:  # Liiga pikk ei saa olla autor
+                    # Eralda nimed
+                    if '&' in line:
+                        authors = [a.strip() for a in line.split('&')]
+                    elif ' and ' in line:
+                        parts = line.split(' and ')
+                        authors = []
+                        for part in parts:
+                            if ',' in part:
+                                authors.extend([a.strip() for a in part.split(',')])
+                            else:
+                                authors.append(part.strip())
+                    elif ',' in line:
+                        authors = [a.strip() for a in line.split(',')]
+                    else:
+                        authors = [line.strip()]
+                    
+                    # Filtreeri ebareaalsed autorid
+                    authors = [a for a in authors if len(a) > 3 and len(a) < 50 and not any(char.isdigit() for char in a)]
+                    if authors:
+                        metadata['authors'] = authors
+                        break
         
         # Otsi aastat
-        year_pattern = r'\((\d{4})\)|(\d{4})\s*[A-Z]'
-        for line in lines:
-            match = re.search(year_pattern, line)
-            if match:
-                metadata['year'] = match.group(1) or match.group(2)
+        year_patterns = [
+            r'\((\d{4})\)',
+            r'\b(\d{4})\b',
+            r'©\s*(\d{4})',
+            r'\b(19\d{2}|20\d{2})\b'
+        ]
+        
+        for line in lines[:50]:  # Vaata esimesi 50 rida
+            for pattern in year_patterns:
+                matches = re.findall(pattern, line)
+                for match in matches:
+                    if match.isdigit() and 1900 <= int(match) <= 2025:
+                        metadata['year'] = match
+                        break
+            if metadata['year']:
                 break
         
         # Otsi DOI
-        doi_pattern = r'doi:\s*([^\s]+|10\.\d{4,9}/[-._;()/:A-Z0-9]+)'
+        doi_patterns = [
+            r'doi:\s*([^\s]+)',
+            r'DOI:\s*([^\s]+)',
+            r'10\.\d{4,9}/[-._;()/:A-Za-z0-9]+'
+        ]
+        
         for line in lines:
-            match = re.search(doi_pattern, line, re.IGNORECASE)
-            if match:
-                metadata['doi'] = match.group(1)
+            for pattern in doi_patterns:
+                match = re.search(pattern, line, re.IGNORECASE)
+                if match:
+                    doi_text = match.group(1) if 'doi:' not in pattern.lower() else match.group(1).lstrip('doi:').strip()
+                    metadata['doi'] = doi_text
+                    break
+            if metadata['doi']:
+                break
+        
+        # Otsi žurnaali nime
+        journal_indicators = ['Journal of', 'Transportation', 'Transport', 'Geography', 'Science', 'Research', 'Review']
+        for i, line in enumerate(lines[:30]):
+            line_lower = line.lower()
+            if any(indicator in line for indicator in journal_indicators) and len(line) < 100:
+                # Vaata, kas järgnevad read on seotud
+                if i + 1 < len(lines) and lines[i + 1]:
+                    if len(lines[i + 1]) < 100 and not any(char.isdigit() for char in lines[i + 1]):
+                        metadata['journal'] = f"{line} {lines[i + 1]}".strip()
+                else:
+                    metadata['journal'] = line.strip()
                 break
         
         # Otsi võtmesõnu
-        for line in lines:
+        for i, line in enumerate(lines):
             if 'keyword' in line.lower():
                 # Proovi järgmised read
                 for j in range(1, 3):
                     if i + j < len(lines):
                         kw_line = lines[i + j].strip()
                         if kw_line:
-                            metadata['keywords'] = [k.strip() for k in re.split(r',|;', kw_line)]
+                            # Eralda komade või semikoolonitega
+                            keywords = re.split(r',|;', kw_line)
+                            metadata['keywords'] = [k.strip() for k in keywords if k.strip()]
                             break
         
         return metadata
@@ -209,19 +305,24 @@ class PDFProcessor:
             full_text, sections = self.extract_text_from_pdf(filepath)
             
             # Eralda struktureeritud metainfo
-            structured_meta = self.extract_structured_metadata(full_text)
+            structured_meta = self.extract_structured_metadata(filepath,full_text)
             
             # Ühenda kõik andmed
             result = {
                 'pdf_metadata': asdict(metadata),
                 'structured_metadata': structured_meta,
-                'full_text': full_text[:5000],  # Säästa mälu, salvesta ainult algus
+                'full_text': full_text[:8000],  # Säästa mälu, salvesta ainult algus (suurendatud)
                 'sections': [asdict(s) for s in sections],
                 'processing_date': datetime.now().isoformat(),
                 'word_count': len(full_text.split())
             }
             
             self.logger.info(f"PDF töödeldud: {metadata.filename}")
+            self.logger.info(f"  Pealkiri: {structured_meta.get('title', 'Teadmata')}")
+            self.logger.info(f"  Autorid: {structured_meta.get('authors', [])}")
+            self.logger.info(f"  Aasta: {structured_meta.get('year', 'Teadmata')}")
+            self.logger.info(f"  DOI: {structured_meta.get('doi', 'Teadmata')}")
+            
             return result
             
         except Exception as e:

+ 62 - 22
src/pipeline.py

@@ -10,6 +10,7 @@ from .pdf_processor import PDFProcessor
 from .deepseek_client import DeepSeekClient
 from .embedding_generator import EmbeddingGenerator
 from .weaviate_client import WeaviateClient
+from .metadata_enhancer import MetadataEnhancer
 from .config import config
 from .utils import setup_logging, save_processed_article
 
@@ -23,12 +24,14 @@ class ArticleProcessingPipeline:
         self.deepseek_client = DeepSeekClient()
         self.embedding_generator = EmbeddingGenerator()
         self.weaviate_client = WeaviateClient()
+        self.metadata_enhancer = MetadataEnhancer()
         
         self.stats = {
             'processed': 0,
             'saved': 0,
             'skipped': 0,
-            'errors': 0
+            'errors': 0,
+            'metadata_enhanced': 0
         }
     
     def process_single_article(self, pdf_path: str) -> Optional[Dict]:
@@ -39,51 +42,79 @@ class ArticleProcessingPipeline:
             # 1. PDF töötlus
             pdf_data = self.pdf_processor.process_pdf(pdf_path)
             
-            # 2. Eralda abstrakt (kui leiad)
+            # 2. Täiusta metainfot AI-ga
+            structured_metadata = pdf_data['structured_metadata']
+            
+            # Kontrolli, kas algne metadata on usaldusväärne
+            if not self.metadata_enhancer.is_metadata_valid(structured_metadata):
+                logger.info(f"Algne metadata ebausaldusväärne, täiustan AI-ga")
+                
+                # Proovi kõigepealt täiustada olemasolevat
+                enhanced_metadata = self.metadata_enhancer.enhance_metadata_with_ai(
+                    text=pdf_data['full_text'],
+                    current_metadata=structured_metadata
+                )
+                
+                # Kui ikka ei ole usaldusväärne, otsi otse tekstist
+                if not self.metadata_enhancer.is_metadata_valid(enhanced_metadata):
+                    logger.info(f"AI täiustamine ebaõnnestus, otsin otse tekstist")
+                    enhanced_metadata = self.metadata_enhancer.extract_metadata_directly(
+                        pdf_data['full_text']
+                    )
+                    self.stats['metadata_enhanced'] += 1
+                
+                # Kasuta täiustatud metadata't
+                if enhanced_metadata:
+                    # Säilita väärtused, mida AI ei täiustanud
+                    for key in ['title', 'authors', 'year', 'journal', 'doi']:
+                        if key in enhanced_metadata and enhanced_metadata[key]:
+                            structured_metadata[key] = enhanced_metadata[key]
+            
+            # 3. Eralda abstrakt (kui leiad)
             abstract_en = ""
             for section in pdf_data['sections']:
                 if section['section_type'] == 'abstract':
                     abstract_en = section['content']
                     break
             
-            # 3. DeepSeek analüüs
+            # 4. DeepSeek analüüs
             summary_data = self.deepseek_client.create_summary(
                 text=pdf_data['full_text'],
                 context={
-                    'title': pdf_data['structured_metadata'].get('title', ''),
-                    'authors': pdf_data['structured_metadata'].get('authors', []),
-                    'year': pdf_data['structured_metadata'].get('year', ''),
-                    'journal': pdf_data['structured_metadata'].get('journal', '')
+                    'title': structured_metadata.get('title', ''),
+                    'authors': structured_metadata.get('authors', []),
+                    'year': structured_metadata.get('year', ''),
+                    'journal': structured_metadata.get('journal', '')
                 }
             )
             
-            # 4. Eralda võtmesõnad
+            # 5. Eralda võtmesõnad
             key_concepts = self.deepseek_client.extract_key_concepts(
                 text=pdf_data['full_text'],
                 summary=summary_data['summary_et']
             )
             
-            # 5. Tuvasta meetodid
+            # 6. Tuvasta meetodid
             methods_used = self.deepseek_client.identify_methods(pdf_data['full_text'])
             
-            # 6. Analüüsi transpordi kontekst
+            # 7. Analüüsi transpordi kontekst
             transport_context = self.deepseek_client.analyze_transport_context(
                 summary_data['summary_et']
             )
             
-            # 7. Koosta lõplik artikkel
+            # 8. Koosta lõplik artikkel
             article_data = {
                 # PDF metainfo
                 'pdf_metadata': pdf_data['pdf_metadata'],
                 'source_file': pdf_path,
                 'file_hash': pdf_data['pdf_metadata']['file_hash'],
                 
-                # Struktureeritud metainfo
-                'title': pdf_data['structured_metadata'].get('title', ''),
-                'authors': pdf_data['structured_metadata'].get('authors', []),
-                'year': pdf_data['structured_metadata'].get('year', ''),
-                'journal': pdf_data['structured_metadata'].get('journal', ''),
-                'doi': pdf_data['structured_metadata'].get('doi', ''),
+                # Struktureeritud metainfo (AI-ga täiustatud)
+                'title': structured_metadata.get('title', ''),
+                'authors': structured_metadata.get('authors', []),
+                'year': structured_metadata.get('year', ''),
+                'journal': structured_metadata.get('journal', ''),
+                'doi': structured_metadata.get('doi', ''),
                 
                 # Tekstisisu
                 'abstract_en': abstract_en,
@@ -101,13 +132,18 @@ class ArticleProcessingPipeline:
                 'section_count': len(pdf_data['sections'])
             }
             
-            # 8. Genereeri embeddingud
+            # Logi lõplik metadata
+            logger.info(f"Lõplik metadata: {article_data['title'][:50]}...")
+            logger.info(f"  Autorid: {article_data['authors']}")
+            logger.info(f"  Aasta: {article_data['year']}")
+            
+            # 9. Genereeri embeddingud
             embeddings = self.embedding_generator.generate_article_embeddings(article_data)
             
-            # 9. Salvesta Weaviate'i
+            # 10. Salvesta Weaviate'i
             saved = self.weaviate_client.save_article(article_data, embeddings)
             
-            # 10. Salvesta töödeldud andmed faili
+            # 11. Salvesta töödeldud andmed faili
             processed_data = {
                 'article': article_data,
                 'embeddings_summary': embeddings.get('summary', []),
@@ -115,7 +151,8 @@ class ArticleProcessingPipeline:
                     'pdf_pages': pdf_data['pdf_metadata']['page_count'],
                     'summary_length': summary_data['summary_length'],
                     'concepts_count': len(key_concepts),
-                    'methods_count': len(methods_used)
+                    'methods_count': len(methods_used),
+                    'metadata_enhanced': self.stats['metadata_enhanced']
                 }
             }
             
@@ -135,6 +172,8 @@ class ArticleProcessingPipeline:
         except Exception as e:
             self.stats['errors'] += 1
             logger.error(f"Viga artikli töötlemisel {pdf_path}: {str(e)}")
+            import traceback
+            logger.error(traceback.format_exc())
             return None
     
     def process_batch(self, pdf_files: List[str]) -> List[Dict]:
@@ -147,7 +186,7 @@ class ArticleProcessingPipeline:
                 results.append(result)
             
             # Väike paus, et mitte üle koormata API-d
-            time.sleep(1)
+            time.sleep(1.5)
         
         return results
     
@@ -192,6 +231,7 @@ class ArticleProcessingPipeline:
         logger.info(f"Salvestatud artikleid: {self.stats['saved']}")
         logger.info(f"Vahele jäetud (duplikaadid): {self.stats['skipped']}")
         logger.info(f"Vigaseid töötlusi: {self.stats['errors']}")
+        logger.info(f"AI-ga täiustatud metainfosid: {self.stats['metadata_enhanced']}")
         logger.info("="*50)
     
     def close(self):

Einige Dateien werden nicht angezeigt, da zu viele Dateien in diesem Diff geändert wurden.