| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- import re
- import json
- from typing import Dict, List, Optional
- import logging
- from .deepseek_client import DeepSeekClient
- from .config import config
- logger = logging.getLogger(__name__)
class MetadataEnhancer:
    """Refine scientific-article metadata with the DeepSeek chat API.

    The class sends the beginning of an article to the model, extracts the
    JSON object from the reply and validates each field (title, authors,
    year, journal, DOI) before accepting it.  Prompts and log messages are
    runtime text and are intentionally kept in Estonian.
    """

    # Section headings that are commonly mistaken for the article title.
    _TITLE_STOPWORDS = ('abstract', 'keywords', 'introduction', 'contents')
    # Substrings that indicate an affiliation rather than a person's name.
    _AFFILIATION_MARKERS = ('university', 'institute')
    # Earliest publication year accepted as plausible.
    _MIN_YEAR = 1900

    def __init__(self):
        """Create the DeepSeek client used for all API calls."""
        self.deepseek_client = DeepSeekClient()

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _extract_json_object(response) -> Optional[Dict]:
        """Return the first JSON object embedded in ``response``, or None.

        The model often wraps its answer in Markdown fences or adds
        commentary.  Instead of a greedy ``{.*}`` regex (which breaks as
        soon as trailing text contains a brace), decoding is attempted at
        every ``{`` until one yields a complete JSON object.
        """
        if not isinstance(response, str):
            return None

        decoder = json.JSONDecoder()
        start = response.find('{')
        while start != -1:
            try:
                candidate, _end = decoder.raw_decode(response[start:])
            except json.JSONDecodeError:
                candidate = None
            if isinstance(candidate, dict):
                return candidate
            start = response.find('{', start + 1)
        return None

    @staticmethod
    def _as_text(value) -> str:
        """Return ``value`` stripped if it is a string, otherwise ``''``.

        The model may return ``null`` or a non-string for any field; those
        are treated as "field not detected" instead of raising.
        """
        return value.strip() if isinstance(value, str) else ''

    @classmethod
    def _looks_like_person(cls, name: str) -> bool:
        """Return True if ``name`` has no digits, e-mail sign or affiliation words."""
        lowered = name.lower()
        if any(char.isdigit() for char in name) or '@' in name:
            return False
        return not any(marker in lowered for marker in cls._AFFILIATION_MARKERS)

    @classmethod
    def _normalize_year(cls, value) -> str:
        """Return ``value`` as a year string if it is plausible, else ``''``.

        The upper bound follows the current date rather than a hard-coded
        year; one extra year is allowed because issues are sometimes dated
        ahead of their actual publication.
        """
        from datetime import date  # local import: keeps the block self-contained

        year = str(value).strip()
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        if year.isdecimal() and cls._MIN_YEAR <= int(year) <= date.today().year + 1:
            return year
        return ''

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def enhance_metadata_with_ai(self, text: str, current_metadata: Dict) -> Dict:
        """Improve existing metadata using DeepSeek.

        Args:
            text: Article text; only the first 4000 characters are sent.
            current_metadata: Metadata extracted so far; shown to the model
                and used as the fallback for every field it fails to supply.

        Returns:
            The cleaned, model-improved metadata, or ``current_metadata``
            itself (unchanged) when the API call fails or the reply holds
            no usable JSON object.
        """
        logger.info("Täiustan metainfot DeepSeeki abil...")

        text = text or ""

        system_prompt = (
            "Sa oled teadusartiklite metainfo spetsialist.\n"
            "Sinu ülesanne on tuvastada antud teadusartikli õige pealkiri, autorid, avaldamisaasta,\n"
            "žurnaal ja DOI.\n"
            "Tagasta vastus JSON formaadis:\n"
            "{\n"
            '"title": "õige pealkiri",\n'
            '"authors": ["autor1", "autor2", ...],\n'
            '"year": "avaldamisaasta",\n'
            '"journal": "žurnaal/konverentsi nimetus",\n'
            '"doi": "DOI identifikaator"\n'
            "}\n"
            "Kui mõni väli on tuvastamata, jäta see tühjaks.\n"
            "Auta valesti tuvastatud väärtusi parandada.\n"
        )

        user_prompt = (
            "Tuvasta järgmise teadusartikli metainfo:\n"
            "CURRENT METADATA:\n"
            f"- Pealkiri: {current_metadata.get('title', 'Teadmata')}\n"
            f"- Autorid: {current_metadata.get('authors', [])}\n"
            f"- Aasta: {current_metadata.get('year', 'Teadmata')}\n"
            f"- Žurnaal: {current_metadata.get('journal', 'Teadmata')}\n"
            f"- DOI: {current_metadata.get('doi', 'Teadmata')}\n"
            "ARTIKLI TEKST (esimesed 4000 märki):\n"
            f"{text[:4000]}\n"
            "Palun analüüsi artiklit ja paranda või täienda metainfot. Tagasta VAID JSON.\n"
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        # Enhancement is best-effort: any client failure falls back to the
        # metadata the caller already has, hence the broad except.
        try:
            response = self.deepseek_client.call_api(messages, temperature=0.3)
            if response:
                enhanced_data = self._extract_json_object(response)
                if enhanced_data is not None:
                    cleaned = self._clean_enhanced_metadata(enhanced_data, current_metadata)
                    logger.info("Metainfo täiustatud AI-ga")
                    return cleaned
                logger.error("Ei leidnud JSON-i vastuses: %s", response[:200])
        except Exception as e:
            logger.error(f"Viga AI metainfo täiustamisel: {e}")

        # The model was unavailable or unusable: return the original.
        return current_metadata

    def _clean_enhanced_metadata(self, enhanced_data: Dict, original_data: Dict) -> Dict:
        """Validate model-supplied metadata field by field.

        Each of ``title``, ``authors``, ``year``, ``journal`` and ``doi`` is
        taken from ``enhanced_data`` when it passes its plausibility check
        and from ``original_data`` (or an empty value) otherwise.

        NOTE(review): the result contains only these five keys, so any
        other keys in ``original_data`` are not carried over - confirm that
        callers expect this.

        Args:
            enhanced_data: Parsed JSON object returned by the model.
            original_data: Fallback values for rejected or missing fields.

        Returns:
            A new dict with exactly the five metadata keys.
        """
        if not isinstance(enhanced_data, dict):
            enhanced_data = {}

        cleaned = {}

        # Title: reasonable length and not a section heading.
        title = self._as_text(enhanced_data.get('title'))
        title_lower = title.lower()
        title_ok = (
            10 < len(title) < 500
            and not any(word in title_lower for word in self._TITLE_STOPWORDS)
        )
        cleaned['title'] = title if title_ok else original_data.get('title', '')

        # Authors: keep only entries that look like a person's name.
        accepted_authors = []
        raw_authors = enhanced_data.get('authors', [])
        if isinstance(raw_authors, list):
            for entry in raw_authors:
                if not isinstance(entry, str):
                    continue
                name = entry.strip()
                if 2 < len(name) < 100 and self._looks_like_person(name):
                    accepted_authors.append(name)
        cleaned['authors'] = accepted_authors or original_data.get('authors', [])

        # Year: stored as a string when plausible.
        year = self._normalize_year(enhanced_data.get('year', ''))
        cleaned['year'] = year if year else original_data.get('year', '')

        # Journal: any non-empty name of sane length.
        journal = self._as_text(enhanced_data.get('journal'))
        if journal and len(journal) < 200:
            cleaned['journal'] = journal
        else:
            cleaned['journal'] = original_data.get('journal', '')

        # DOI: either a bare "10.xxxx/..." identifier or a doi.org URL.
        doi = self._as_text(enhanced_data.get('doi'))
        if doi and (doi.startswith('10.') or 'doi.org' in doi):
            cleaned['doi'] = doi
        else:
            cleaned['doi'] = original_data.get('doi', '')

        return cleaned

    def extract_metadata_directly(self, text: str) -> Dict:
        """Extract metadata from the text alone, without prior metadata.

        Useful when the existing metadata is entirely wrong.  Only the
        first 3000 characters of ``text`` are sent to the model.

        Returns:
            The cleaned metadata, or an empty dict when the API call fails
            or the reply holds no usable JSON object.
        """
        logger.info("Otsin metainfot otse tekstist...")

        text = text or ""

        system_prompt = (
            "Otsi antud teadusartikli tekstist pealkirja, autoreid,\n"
            "avaldamisaastat, žurnaali ja DOI-d. Tagasta tulemus JSON formaadis.\n"
        )

        user_prompt = (
            "Artikli tekst (esimesed 3000 märki):\n"
            f"{text[:3000]}\n"
            "Palun otsi metainfot. Tagasta VAID JSON.\n"
        )

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]

        # Best-effort, like enhance_metadata_with_ai: failures are logged
        # and reported to the caller as an empty result.
        try:
            response = self.deepseek_client.call_api(messages, temperature=0.3)
            if response:
                metadata = self._extract_json_object(response)
                if metadata is not None:
                    return self._clean_enhanced_metadata(metadata, {})
                # Previously swallowed by a bare ``except: pass``.
                logger.error("Ei leidnud JSON-i vastuses: %s", response[:200])
        except Exception as e:
            logger.error(f"Viga otse metainfo eraldamisel: {e}")

        return {}

    def is_metadata_valid(self, metadata: Dict) -> bool:
        """Return True when ``metadata`` looks trustworthy.

        Requires a title of 5-500 characters, at least one author where no
        author looks like an address or affiliation, and a plausible year.
        Values of an unexpected type make the metadata invalid rather than
        raising.
        """
        # Title
        title = metadata.get('title', '')
        if not isinstance(title, str) or not 5 <= len(title) <= 500:
            return False

        # Authors
        authors = metadata.get('authors', [])
        if not authors:
            return False
        if isinstance(authors, str):
            # A bare string is one author, not a sequence of characters.
            authors = [authors]
        if not isinstance(authors, (list, tuple)):
            return False

        # Authors must not be addresses or other junk.
        for author in authors:
            if (not isinstance(author, str)
                    or len(author) > 100
                    or not self._looks_like_person(author)):
                return False

        # Year
        return bool(self._normalize_year(metadata.get('year', '')))
|