|
@@ -10,6 +10,7 @@ from .pdf_processor import PDFProcessor
|
|
|
from .deepseek_client import DeepSeekClient
|
|
from .deepseek_client import DeepSeekClient
|
|
|
from .embedding_generator import EmbeddingGenerator
|
|
from .embedding_generator import EmbeddingGenerator
|
|
|
from .weaviate_client import WeaviateClient
|
|
from .weaviate_client import WeaviateClient
|
|
|
|
|
+from .metadata_enhancer import MetadataEnhancer
|
|
|
from .config import config
|
|
from .config import config
|
|
|
from .utils import setup_logging, save_processed_article
|
|
from .utils import setup_logging, save_processed_article
|
|
|
|
|
|
|
@@ -23,12 +24,14 @@ class ArticleProcessingPipeline:
|
|
|
self.deepseek_client = DeepSeekClient()
|
|
self.deepseek_client = DeepSeekClient()
|
|
|
self.embedding_generator = EmbeddingGenerator()
|
|
self.embedding_generator = EmbeddingGenerator()
|
|
|
self.weaviate_client = WeaviateClient()
|
|
self.weaviate_client = WeaviateClient()
|
|
|
|
|
+ self.metadata_enhancer = MetadataEnhancer()
|
|
|
|
|
|
|
|
self.stats = {
|
|
self.stats = {
|
|
|
'processed': 0,
|
|
'processed': 0,
|
|
|
'saved': 0,
|
|
'saved': 0,
|
|
|
'skipped': 0,
|
|
'skipped': 0,
|
|
|
- 'errors': 0
|
|
|
|
|
|
|
+ 'errors': 0,
|
|
|
|
|
+ 'metadata_enhanced': 0
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
def process_single_article(self, pdf_path: str) -> Optional[Dict]:
|
|
def process_single_article(self, pdf_path: str) -> Optional[Dict]:
|
|
@@ -39,51 +42,79 @@ class ArticleProcessingPipeline:
|
|
|
# 1. PDF töötlus
|
|
# 1. PDF töötlus
|
|
|
pdf_data = self.pdf_processor.process_pdf(pdf_path)
|
|
pdf_data = self.pdf_processor.process_pdf(pdf_path)
|
|
|
|
|
|
|
|
- # 2. Eralda abstrakt (kui leiad)
|
|
|
|
|
|
|
+ # 2. Täiusta metainfot AI-ga
|
|
|
|
|
+ structured_metadata = pdf_data['structured_metadata']
|
|
|
|
|
+
|
|
|
|
|
+ # Kontrolli, kas algne metadata on usaldusväärne
|
|
|
|
|
+ if not self.metadata_enhancer.is_metadata_valid(structured_metadata):
|
|
|
|
|
+ logger.info(f"Algne metadata ebausaldusväärne, täiustan AI-ga")
|
|
|
|
|
+
|
|
|
|
|
+ # Proovi kõigepealt täiustada olemasolevat
|
|
|
|
|
+ enhanced_metadata = self.metadata_enhancer.enhance_metadata_with_ai(
|
|
|
|
|
+ text=pdf_data['full_text'],
|
|
|
|
|
+ current_metadata=structured_metadata
|
|
|
|
|
+ )
|
|
|
|
|
+
|
|
|
|
|
+ # Kui ikka ei ole usaldusväärne, otsi otse tekstist
|
|
|
|
|
+ if not self.metadata_enhancer.is_metadata_valid(enhanced_metadata):
|
|
|
|
|
+ logger.info(f"AI täiustamine ebaõnnestus, otsin otse tekstist")
|
|
|
|
|
+ enhanced_metadata = self.metadata_enhancer.extract_metadata_directly(
|
|
|
|
|
+ pdf_data['full_text']
|
|
|
|
|
+ )
|
|
|
|
|
+ self.stats['metadata_enhanced'] += 1
|
|
|
|
|
+
|
|
|
|
|
+ # Kasuta täiustatud metadata't
|
|
|
|
|
+ if enhanced_metadata:
|
|
|
|
|
+ # Säilita väärtused, mida AI ei täiustanud
|
|
|
|
|
+ for key in ['title', 'authors', 'year', 'journal', 'doi']:
|
|
|
|
|
+ if key in enhanced_metadata and enhanced_metadata[key]:
|
|
|
|
|
+ structured_metadata[key] = enhanced_metadata[key]
|
|
|
|
|
+
|
|
|
|
|
+ # 3. Eralda abstrakt (kui leiad)
|
|
|
abstract_en = ""
|
|
abstract_en = ""
|
|
|
for section in pdf_data['sections']:
|
|
for section in pdf_data['sections']:
|
|
|
if section['section_type'] == 'abstract':
|
|
if section['section_type'] == 'abstract':
|
|
|
abstract_en = section['content']
|
|
abstract_en = section['content']
|
|
|
break
|
|
break
|
|
|
|
|
|
|
|
- # 3. DeepSeek analüüs
|
|
|
|
|
|
|
+ # 4. DeepSeek analüüs
|
|
|
summary_data = self.deepseek_client.create_summary(
|
|
summary_data = self.deepseek_client.create_summary(
|
|
|
text=pdf_data['full_text'],
|
|
text=pdf_data['full_text'],
|
|
|
context={
|
|
context={
|
|
|
- 'title': pdf_data['structured_metadata'].get('title', ''),
|
|
|
|
|
- 'authors': pdf_data['structured_metadata'].get('authors', []),
|
|
|
|
|
- 'year': pdf_data['structured_metadata'].get('year', ''),
|
|
|
|
|
- 'journal': pdf_data['structured_metadata'].get('journal', '')
|
|
|
|
|
|
|
+ 'title': structured_metadata.get('title', ''),
|
|
|
|
|
+ 'authors': structured_metadata.get('authors', []),
|
|
|
|
|
+ 'year': structured_metadata.get('year', ''),
|
|
|
|
|
+ 'journal': structured_metadata.get('journal', '')
|
|
|
}
|
|
}
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # 4. Eralda võtmesõnad
|
|
|
|
|
|
|
+ # 5. Eralda võtmesõnad
|
|
|
key_concepts = self.deepseek_client.extract_key_concepts(
|
|
key_concepts = self.deepseek_client.extract_key_concepts(
|
|
|
text=pdf_data['full_text'],
|
|
text=pdf_data['full_text'],
|
|
|
summary=summary_data['summary_et']
|
|
summary=summary_data['summary_et']
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # 5. Tuvasta meetodid
|
|
|
|
|
|
|
+ # 6. Tuvasta meetodid
|
|
|
methods_used = self.deepseek_client.identify_methods(pdf_data['full_text'])
|
|
methods_used = self.deepseek_client.identify_methods(pdf_data['full_text'])
|
|
|
|
|
|
|
|
- # 6. Analüüsi transpordi kontekst
|
|
|
|
|
|
|
+ # 7. Analüüsi transpordi kontekst
|
|
|
transport_context = self.deepseek_client.analyze_transport_context(
|
|
transport_context = self.deepseek_client.analyze_transport_context(
|
|
|
summary_data['summary_et']
|
|
summary_data['summary_et']
|
|
|
)
|
|
)
|
|
|
|
|
|
|
|
- # 7. Koosta lõplik artikkel
|
|
|
|
|
|
|
+ # 8. Koosta lõplik artikkel
|
|
|
article_data = {
|
|
article_data = {
|
|
|
# PDF metainfo
|
|
# PDF metainfo
|
|
|
'pdf_metadata': pdf_data['pdf_metadata'],
|
|
'pdf_metadata': pdf_data['pdf_metadata'],
|
|
|
'source_file': pdf_path,
|
|
'source_file': pdf_path,
|
|
|
'file_hash': pdf_data['pdf_metadata']['file_hash'],
|
|
'file_hash': pdf_data['pdf_metadata']['file_hash'],
|
|
|
|
|
|
|
|
- # Struktureeritud metainfo
|
|
|
|
|
- 'title': pdf_data['structured_metadata'].get('title', ''),
|
|
|
|
|
- 'authors': pdf_data['structured_metadata'].get('authors', []),
|
|
|
|
|
- 'year': pdf_data['structured_metadata'].get('year', ''),
|
|
|
|
|
- 'journal': pdf_data['structured_metadata'].get('journal', ''),
|
|
|
|
|
- 'doi': pdf_data['structured_metadata'].get('doi', ''),
|
|
|
|
|
|
|
+ # Struktureeritud metainfo (AI-ga täiustatud)
|
|
|
|
|
+ 'title': structured_metadata.get('title', ''),
|
|
|
|
|
+ 'authors': structured_metadata.get('authors', []),
|
|
|
|
|
+ 'year': structured_metadata.get('year', ''),
|
|
|
|
|
+ 'journal': structured_metadata.get('journal', ''),
|
|
|
|
|
+ 'doi': structured_metadata.get('doi', ''),
|
|
|
|
|
|
|
|
# Tekstisisu
|
|
# Tekstisisu
|
|
|
'abstract_en': abstract_en,
|
|
'abstract_en': abstract_en,
|
|
@@ -101,13 +132,18 @@ class ArticleProcessingPipeline:
|
|
|
'section_count': len(pdf_data['sections'])
|
|
'section_count': len(pdf_data['sections'])
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
- # 8. Genereeri embeddingud
|
|
|
|
|
|
|
+ # Logi lõplik metadata
|
|
|
|
|
+ logger.info(f"Lõplik metadata: {article_data['title'][:50]}...")
|
|
|
|
|
+ logger.info(f" Autorid: {article_data['authors']}")
|
|
|
|
|
+ logger.info(f" Aasta: {article_data['year']}")
|
|
|
|
|
+
|
|
|
|
|
+ # 9. Genereeri embeddingud
|
|
|
embeddings = self.embedding_generator.generate_article_embeddings(article_data)
|
|
embeddings = self.embedding_generator.generate_article_embeddings(article_data)
|
|
|
|
|
|
|
|
- # 9. Salvesta Weaviate'i
|
|
|
|
|
|
|
+ # 10. Salvesta Weaviate'i
|
|
|
saved = self.weaviate_client.save_article(article_data, embeddings)
|
|
saved = self.weaviate_client.save_article(article_data, embeddings)
|
|
|
|
|
|
|
|
- # 10. Salvesta töödeldud andmed faili
|
|
|
|
|
|
|
+ # 11. Salvesta töödeldud andmed faili
|
|
|
processed_data = {
|
|
processed_data = {
|
|
|
'article': article_data,
|
|
'article': article_data,
|
|
|
'embeddings_summary': embeddings.get('summary', []),
|
|
'embeddings_summary': embeddings.get('summary', []),
|
|
@@ -115,7 +151,8 @@ class ArticleProcessingPipeline:
|
|
|
'pdf_pages': pdf_data['pdf_metadata']['page_count'],
|
|
'pdf_pages': pdf_data['pdf_metadata']['page_count'],
|
|
|
'summary_length': summary_data['summary_length'],
|
|
'summary_length': summary_data['summary_length'],
|
|
|
'concepts_count': len(key_concepts),
|
|
'concepts_count': len(key_concepts),
|
|
|
- 'methods_count': len(methods_used)
|
|
|
|
|
|
|
+ 'methods_count': len(methods_used),
|
|
|
|
|
+ 'metadata_enhanced': self.stats['metadata_enhanced']
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
@@ -135,6 +172,8 @@ class ArticleProcessingPipeline:
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
self.stats['errors'] += 1
|
|
self.stats['errors'] += 1
|
|
|
logger.error(f"Viga artikli töötlemisel {pdf_path}: {str(e)}")
|
|
logger.error(f"Viga artikli töötlemisel {pdf_path}: {str(e)}")
|
|
|
|
|
+ import traceback
|
|
|
|
|
+ logger.error(traceback.format_exc())
|
|
|
return None
|
|
return None
|
|
|
|
|
|
|
|
def process_batch(self, pdf_files: List[str]) -> List[Dict]:
|
|
def process_batch(self, pdf_files: List[str]) -> List[Dict]:
|
|
@@ -147,7 +186,7 @@ class ArticleProcessingPipeline:
|
|
|
results.append(result)
|
|
results.append(result)
|
|
|
|
|
|
|
|
# Väike paus, et mitte üle koormata API-d
|
|
# Väike paus, et mitte üle koormata API-d
|
|
|
- time.sleep(1)
|
|
|
|
|
|
|
+ time.sleep(1.5)
|
|
|
|
|
|
|
|
return results
|
|
return results
|
|
|
|
|
|
|
@@ -192,6 +231,7 @@ class ArticleProcessingPipeline:
|
|
|
logger.info(f"Salvestatud artikleid: {self.stats['saved']}")
|
|
logger.info(f"Salvestatud artikleid: {self.stats['saved']}")
|
|
|
logger.info(f"Vahele jäetud (duplikaadid): {self.stats['skipped']}")
|
|
logger.info(f"Vahele jäetud (duplikaadid): {self.stats['skipped']}")
|
|
|
logger.info(f"Vigaseid töötlusi: {self.stats['errors']}")
|
|
logger.info(f"Vigaseid töötlusi: {self.stats['errors']}")
|
|
|
|
|
+ logger.info(f"AI-ga täiustatud metainfosid: {self.stats['metadata_enhanced']}")
|
|
|
logger.info("="*50)
|
|
logger.info("="*50)
|
|
|
|
|
|
|
|
def close(self):
|
|
def close(self):
|