# save_articles_to_pdf.py
import os
import sys
import re
from datetime import datetime
from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER, TA_LEFT
from reportlab.lib import colors
from reportlab.lib.units import inch, cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
import json
# Lisa src kaust Pythoni teele
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
from src.weaviate_client import WeaviateClient
# Typographic punctuation folded to plain ASCII so the PDF text stays simple.
_TYPOGRAPHIC_TO_ASCII = str.maketrans({
    '\u00a0': ' ',    # non-breaking space
    '\u2026': '...',  # ellipsis
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
    '\u2018': "'",    # left single quote
    '\u2019': "'",    # right single quote
    '\u201c': '"',    # left double quote
    '\u201d': '"',    # right double quote
})


def clean_html_tags(text):
    """Strip HTML/XML tags and normalise special characters for ReportLab.

    Tags are removed first, then HTML character references (``&amp;``,
    ``&nbsp;``, ``&#39;`` ...) are decoded, and finally typographic
    punctuation is folded to its plain ASCII equivalent.

    Args:
        text: Source string. Any falsy value (``None``, ``""``) is accepted.

    Returns:
        The cleaned string, or ``""`` when *text* is falsy.
    """
    import html  # function-scope import keeps this block self-contained

    if not text:
        return ""

    # Remove tags before decoding entities, so an escaped "&lt;b&gt;" survives
    # as the literal text "<b>" instead of being stripped as a tag.
    text = re.sub(r'<[^>]+>', '', text)

    # Decode every character reference in one pass. A single pass also avoids
    # double-decoding input such as "&amp;lt;".
    text = html.unescape(text)

    # "&nbsp;" decodes to U+00A0, which the table below turns into a space.
    return text.translate(_TYPOGRAPHIC_TO_ASCII)
def clean_markdown_for_pdf(text):
    """Flatten Markdown (and embedded HTML) into single-line plain text.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            falsy values yield ``""``.

    Returns:
        The text with HTML tags, Markdown markers and fenced code blocks
        removed, bullets rewritten as "• ", and every whitespace run
        (newlines included) collapsed to a single space.
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    # HTML tags and entities first.
    text = clean_html_tags(text)

    # Drop fenced code blocks before anything else, so markers inside them
    # cannot pair up with markers in the surrounding prose.
    text = re.sub(r'```[^`]+```', '', text)

    # Heading markers. (?<!\S) requires the '#' run to start the text or to
    # follow whitespace, so "C# code" keeps its '#', while headings in
    # already-flattened text are still removed.
    text = re.sub(r'(?<!\S)#{1,6}\s+', '', text)

    # Lists are converted before emphasis is stripped; otherwise a leading
    # "* " bullet can be consumed as the opening marker of an *italic* span.
    text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)

    # Emphasis markers: keep the inner text only.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # **bold**
    text = re.sub(r'\*(.+?)\*', r'\1', text)      # *italic*
    text = re.sub(r'__(.+?)__', r'\1', text)      # __underline__
    text = re.sub(r'~~(.+?)~~', r'\1', text)      # ~~strikethrough~~

    # Links are reduced to their label before inline code is rewritten to
    # "[code]", so that "`f`(x)" is not mistaken for a link afterwards.
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
    text = re.sub(r'`([^`]+)`', r'[\1]', text)

    # Collapse all whitespace, newlines included.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()
def parse_transport_context(context_data):
    """Normalise a stored transport-context value for display.

    Args:
        context_data: A JSON string, a dict, or any other value.

    Returns:
        * For a dict, or a string that decodes to a JSON object: a new dict
          whose string values have been passed through
          ``clean_markdown_for_pdf``; other values are kept as they are.
        * For a string that is not valid JSON: ``{"raw_analysis": <cleaned
          text>}``.
        * Otherwise the value (or the decoded non-object JSON value)
          unchanged.
    """
    if isinstance(context_data, str):
        try:
            context_data = json.loads(context_data)
        except json.JSONDecodeError:
            # Not JSON: treat the whole string as free-form analysis text.
            return {"raw_analysis": clean_markdown_for_pdf(context_data)}

    # A dict decoded from JSON gets the same cleaning as a dict passed in
    # directly, so Markdown in its values never reaches the PDF.
    if isinstance(context_data, dict):
        return {
            key: clean_markdown_for_pdf(value) if isinstance(value, str) else value
            for key, value in context_data.items()
        }

    return context_data
def format_context_for_pdf(parsed_context):
    """Render a parsed transport context as labelled text sections.

    Args:
        parsed_context: Usually the dict produced by
            ``parse_transport_context``; any other value is stringified and
            cleaned with ``clean_markdown_for_pdf``.

    Returns:
        For a dict, the non-empty known sections in a fixed order, each as
        ``"LABEL: value"``, joined by blank lines (``""`` if none apply).
        For any other value, its cleaned string form.
    """
    if not isinstance(parsed_context, dict):
        return clean_markdown_for_pdf(str(parsed_context))

    sections = []

    def add_if_present(key, label):
        # Falsy values (missing, None, "") are skipped.
        value = parsed_context.get(key)
        if value:
            sections.append(f"{label}: {value}")

    add_if_present('theoretical_contribution', 'TEOREETILINE PANUS')
    add_if_present('practical_applicability', 'PRAKTILINE RAKENDATAVUS')
    add_if_present('problem_solving', 'PROBLEEMILAHDUS')
    add_if_present('limitations', 'PIIRANGUD')

    # A score of 0 is still printed, but placeholders are not: a bare
    # key-presence check would emit "None/10" or "N/A/10".
    score = parsed_context.get('relevance_score')
    if score is not None and score not in ('', 'N/A'):
        sections.append(f"RELEVANTSUSE SKOOR: {score}/10")

    # Both the structured and the free-form analysis share one label.
    add_if_present('analysis', 'ANALÜÜS')
    add_if_present('raw_analysis', 'ANALÜÜS')

    return "\n\n".join(sections)
def _article_from_properties(props, clean=True):
    """Map one object's property dict onto the article dict used for the PDF.

    Args:
        props: Property mapping of a stored article (must support ``.get``).
        clean: When true, text fields go through ``clean_markdown_for_pdf``
            and the transport context through ``parse_transport_context``.
            When false, values are only stringified; this is the fallback
            used when cleaning itself failed.

    Returns:
        A dict with the fixed set of article keys. A property that is missing
        or ``None`` gets its default, so list fields are always iterable.
    """
    def scalar(key, default=''):
        value = props.get(key)
        return default if value is None else value

    def text(key, default=''):
        value = scalar(key, default)
        return clean_markdown_for_pdf(value) if clean else str(value)

    def text_list(key):
        # "or []" also covers a property that is present but None.
        items = props.get(key) or []
        return [clean_markdown_for_pdf(item) if clean else str(item) for item in items]

    raw_context = props.get('transport_context')
    if clean:
        context = parse_transport_context({} if raw_context is None else raw_context)
    else:
        context = str(raw_context) if raw_context else ''

    return {
        'article_id': scalar('article_id', 'N/A'),
        'title': text('title', 'N/A'),
        'authors': props.get('authors') or [],
        'year': scalar('year', 'N/A'),
        'journal': text('journal', 'N/A'),
        'doi': scalar('doi'),
        'abstract_en': text('abstract_en'),
        'summary_et': text('summary_et'),
        'key_concepts': text_list('key_concepts'),
        'methods_used': text_list('methods_used'),
        'transport_context': context,
        'relevance_score': scalar('relevance_score', 'N/A'),
        'processing_date': scalar('processing_date'),
        'source_file': scalar('source_file'),
    }


def get_all_articles_from_weaviate():
    """Fetch every object of the "ScientificArticle" collection as a dict.

    Returns:
        A list of article dicts (see ``_article_from_properties``). Errors
        while talking to the database are printed with a traceback and the
        articles collected so far (possibly none) are returned. The client is
        always closed.
    """
    client = WeaviateClient()
    articles = []

    try:
        collection = client.client.collections.get("ScientificArticle")

        # Count first so the fetch can ask for exactly that many objects.
        count_response = collection.aggregate.over_all(total_count=True)
        total = count_response.total_count
        print(f"Weaviate'is leidsin {total} artiklit")

        if total:
            response = collection.query.fetch_objects(limit=total)
            for obj in response.objects:
                props = obj.properties
                try:
                    articles.append(_article_from_properties(props, clean=True))
                except Exception as e:
                    print(f" Viga artikli {props.get('article_id', 'unknown')} töötlemisel: {e}")
                    # Keep the article, just without text cleaning. The
                    # fallback tolerates None values, so one bad record can
                    # no longer raise here and abort the remaining articles.
                    # NOTE(review): assumes unset properties can come back as
                    # None rather than being absent; confirm against the
                    # Weaviate client in use.
                    articles.append(_article_from_properties(props, clean=False))

    except Exception as e:
        print(f"Viga artiklite toomisel: {e}")
        import traceback
        traceback.print_exc()
    finally:
        client.close()

    return articles
def format_summary_for_pdf(summary):
    """Prepare an article summary for the PDF.

    The summary is flattened with ``clean_markdown_for_pdf``, line breaks are
    re-inserted around the five known numbered section headings, runs of
    three or more newlines are reduced to two, and the result is cut off
    after 4000 characters with a truncation notice.

    Args:
        summary: Summary text; non-strings are converted with ``str()``.

    Returns:
        The formatted summary, or ``""`` for a falsy input.
    """
    if not summary:
        return ""

    flattened = clean_markdown_for_pdf(summary if isinstance(summary, str) else str(summary))

    # (heading, heading wrapped in line breaks), applied in this order.
    heading_breaks = (
        ('1. ARTIKLI PEAMISED PUNKTID:', '\n1. ARTIKLI PEAMISED PUNKTID:\n'),
        ('2. KASUTATUD MEETODID:', '\n\n2. KASUTATUD MEETODID:\n'),
        ('3. PEAMISED TULEMUSED:', '\n\n3. PEAMISED TULEMUSED:\n'),
        ('4. JÄRELDUSED JA SOOVITUSED:', '\n\n4. JÄRELDUSED JA SOOVITUSED:\n'),
        ('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n5. TRANSFORDIPLANEERIMISE KONTEKST:\n'),
    )
    for heading, with_breaks in heading_breaks:
        flattened = flattened.replace(heading, with_breaks)

    # Never more than one blank line in a row.
    flattened = re.sub(r'\n{3,}', '\n\n', flattened)

    max_chars = 4000
    if len(flattened) <= max_chars:
        return flattened
    return flattened[:max_chars] + "... [kokkuvõte lõigatud, liiga pikk]"
def _paragraph_markup(text):
    """Escape *text* for a ReportLab ``Paragraph`` and keep its line breaks.

    ``Paragraph`` parses its text as XML-like markup, so a raw ``&`` or ``<``
    in article data could be misread as markup. Whitespace (newlines
    included) is collapsed when the paragraph is laid out, so intended breaks
    are turned into explicit ``<br/>`` tags.
    """
    from xml.sax.saxutils import escape  # function-scope: no file-level change needed

    return escape(str(text)).replace('\n', '<br/>')


def _format_processing_date(value):
    """Render a processing timestamp as ``dd.mm.YYYY HH:MM``.

    Accepts a ``datetime`` or an ISO-8601 string. A string that cannot be
    parsed is returned as its first 19 characters.
    """
    if isinstance(value, datetime):
        return value.strftime('%d.%m.%Y %H:%M')

    raw = str(value)
    try:
        # Drop fractional seconds (and whatever follows them), then make a
        # trailing "Z" acceptable to fromisoformat().
        trimmed = raw.split('.')[0].replace('Z', '+00:00')
        return datetime.fromisoformat(trimmed).strftime('%d.%m.%Y %H:%M')
    except ValueError:
        return raw[:19]


def _key_value_table(rows, cell_style, padding, label_color=None):
    """Build a two-column label/value table whose value cells wrap.

    Values are wrapped in ``Paragraph`` objects because plain strings in a
    ``Table`` cell are drawn on one line and overflow the column.
    """
    data = [
        [label, Paragraph(_paragraph_markup(value), cell_style)]
        for label, value in rows
    ]
    commands = [
        ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
        ('FONTSIZE', (0, 0), (-1, -1), 9),
        ('BOTTOMPADDING', (0, 0), (-1, -1), padding),
        ('TOPPADDING', (0, 0), (-1, -1), padding),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('LEFTPADDING', (0, 0), (0, -1), 0),
    ]
    if label_color is not None:
        commands.append(('TEXTCOLOR', (0, 0), (0, -1), label_color))

    table = Table(data, colWidths=[2 * cm, 12 * cm])
    table.setStyle(TableStyle(commands))
    return table


def create_pdf_from_articles(articles, output_filename):
    """Write all articles into one A4 PDF, one article per page.

    Args:
        articles: List of article dicts as produced by
            ``get_all_articles_from_weaviate``.
        output_filename: Path of the PDF file to create.

    Returns:
        The number of articles written (``len(articles)``).
    """
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=A4,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=72
    )

    # Styles derived from ReportLab's sample style sheet.
    styles = getSampleStyleSheet()

    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=14,
        spaceAfter=12,
        textColor=colors.HexColor('#2c3e50'),
        alignment=TA_LEFT
    )
    subtitle_style = ParagraphStyle(
        'CustomSubtitle',
        parent=styles['Heading2'],
        fontSize=12,
        spaceAfter=6,
        textColor=colors.HexColor('#34495e'),
        alignment=TA_LEFT
    )
    section_style = ParagraphStyle(
        'CustomSection',
        parent=styles['Heading3'],
        fontSize=11,
        spaceAfter=6,
        spaceBefore=12,
        textColor=colors.HexColor('#7f8c8d'),
        alignment=TA_LEFT
    )
    normal_style = ParagraphStyle(
        'CustomNormal',
        parent=styles['Normal'],
        fontSize=10,
        spaceAfter=6,
        alignment=TA_JUSTIFY,
        leading=14  # line spacing
    )
    metadata_style = ParagraphStyle(
        'CustomMetadata',
        parent=styles['Normal'],
        fontSize=9,
        spaceAfter=3,
        textColor=colors.HexColor('#5d6d7e'),
        alignment=TA_LEFT
    )
    # Style for wrapped value cells in the metadata / tag tables.
    table_cell_style = ParagraphStyle(
        'CustomTableCell',
        parent=styles['Normal'],
        fontSize=9,
        leading=11,
        alignment=TA_LEFT
    )

    elements = []

    # Document header.
    elements.append(Paragraph("TEADUSARTIKLITE ANDMEBAAS", title_style))
    elements.append(Spacer(1, 12))

    today = datetime.now().strftime("%d.%m.%Y %H:%M")
    elements.append(Paragraph(f"Eksporditud: {today}", metadata_style))
    elements.append(Paragraph(f"Artikleid kokku: {len(articles)}", metadata_style))
    elements.append(Spacer(1, 24))

    total = len(articles)
    for number, article in enumerate(articles, start=1):
        # Title.
        title = _paragraph_markup(article.get('title', 'N/A'))
        elements.append(Paragraph(f"{number}. {title}", title_style))

        # Authors. A bare string is treated as one author, not as characters.
        authors = article.get('authors')
        if authors:
            if isinstance(authors, str):
                authors = [authors]
            authors_text = ", ".join(str(author) for author in authors)
            elements.append(
                Paragraph(f"Autorid: {_paragraph_markup(authors_text)}", subtitle_style)
            )

        # Bibliographic metadata table.
        metadata_rows = []
        year = article.get('year')
        if year and year != 'N/A':
            metadata_rows.append(('Aasta:', str(year)))
        journal = article.get('journal')
        if journal and journal != 'N/A':
            metadata_rows.append(('Žurnaal:', str(journal)))
        doi = article.get('doi')
        if doi:
            metadata_rows.append(('DOI:', str(doi)))
        relevance = article.get('relevance_score')
        if relevance and relevance != 'N/A':
            metadata_rows.append(('Relevantsus:', f"{relevance}/10"))

        if metadata_rows:
            elements.append(_key_value_table(metadata_rows, table_cell_style, padding=6))
            elements.append(Spacer(1, 12))

        # Key concepts and methods.
        tag_rows = []
        key_concepts = article.get('key_concepts')
        if key_concepts:
            # Limited to the first 10 concepts.
            tag_rows.append(
                ('Võtmesõnad:', ", ".join(str(c) for c in key_concepts[:10]))
            )
        methods_used = article.get('methods_used')
        if methods_used:
            tag_rows.append(('Meetodid:', ", ".join(str(m) for m in methods_used)))

        if tag_rows:
            elements.append(
                _key_value_table(
                    tag_rows,
                    table_cell_style,
                    padding=4,
                    label_color=colors.HexColor('#5d6d7e'),
                )
            )
            elements.append(Spacer(1, 12))

        # English abstract, truncated before escaping so no entity is cut.
        abstract_text = article.get('abstract_en')
        if abstract_text:
            elements.append(Paragraph("ABSTRAKT (inglise keeles):", section_style))
            abstract_text = str(abstract_text)
            if len(abstract_text) > 800:
                abstract_text = abstract_text[:800] + "..."
            elements.append(Paragraph(_paragraph_markup(abstract_text), normal_style))
            elements.append(Spacer(1, 12))

        # Estonian summary. The formatter marks section breaks with newlines,
        # which _paragraph_markup turns into visible line breaks.
        if article.get('summary_et'):
            elements.append(Paragraph("KOKKUVÕTE (eesti keeles):", section_style))
            summary = format_summary_for_pdf(article['summary_et']).strip()
            elements.append(Paragraph(_paragraph_markup(summary), normal_style))
            elements.append(Spacer(1, 12))

        # Transport-planning context; the heading is only emitted when there
        # is text to put under it.
        if article.get('transport_context'):
            context_text = format_context_for_pdf(article['transport_context'])
            if context_text:
                elements.append(Paragraph("TRANSFORDIPLANEERIMISE KONTEKST:", section_style))
                elements.append(Paragraph(_paragraph_markup(context_text), normal_style))
                elements.append(Spacer(1, 12))

        # Source file and processing info.
        footer_info = []
        if article.get('source_file'):
            source_name = os.path.basename(str(article['source_file']))
            footer_info.append(f"Allikfail: {source_name}")
        if article.get('processing_date'):
            footer_info.append(
                f"Töödeldud: {_format_processing_date(article['processing_date'])}"
            )

        if footer_info:
            elements.append(Spacer(1, 6))
            elements.append(
                Paragraph(_paragraph_markup(" | ".join(footer_info)), metadata_style)
            )

        # Page break after every article except the last one.
        if number < total:
            elements.append(PageBreak())
        else:
            elements.append(Spacer(1, 24))

    # Closing info.
    elements.append(Paragraph("=" * 80, metadata_style))
    elements.append(Spacer(1, 6))
    elements.append(Paragraph(f"Kokku eksporditud artikleid: {len(articles)}", metadata_style))
    elements.append(Paragraph("Eksporditud Weaviate teadusartiklite andmebaasist", metadata_style))
    elements.append(Paragraph(f"PDF genereeritud: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}", metadata_style))

    # Lay out and write the file.
    doc.build(elements)

    return len(articles)
def main():
    """Export every article from Weaviate into a timestamped PDF.

    Prints progress to stdout, writes the file under ``./data/exports`` and
    finishes with a short preview of the first three titles. A failure while
    building the PDF is reported with a traceback instead of propagating.
    """
    banner = "=" * 60
    print(banner)
    print("ARTIKLITE EKSPORT PDF FAILI")
    print(banner)

    # Fetch the articles; nothing to do when the collection is empty.
    print("Toon artikleid Weaviate'ist...")
    articles = get_all_articles_from_weaviate()
    if not articles:
        print("Ei leidnud ühtegi artiklit Weaviate'is!")
        return
    print(f"Leidsin {len(articles)} artiklit")

    # Timestamped output path inside the export directory.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "./data/exports"
    os.makedirs(output_dir, exist_ok=True)
    output_filename = os.path.join(output_dir, f"artiklid_eksport_{timestamp}.pdf")

    print(f"Loon PDF faili: {output_filename}")
    try:
        article_count = create_pdf_from_articles(articles, output_filename)
        size_kb = os.path.getsize(output_filename) / 1024

        print(banner)
        print(f"✅ VALMIS! Loodud PDF fail: {output_filename}")
        print(f" - Eksporditud artikleid: {article_count}")
        print(f" - Faili suurus: {size_kb:.1f} KB")
        print(banner)

        # Preview of the first three titles, each cut to 60 characters.
        if articles:
            print("\nEsimesed artiklid:")
            for position, entry in enumerate(articles[:3], start=1):
                full_title = entry['title']
                shown = full_title if len(full_title) <= 60 else full_title[:60] + "..."
                print(f" {position}. {shown}")

    except Exception as e:
        print(f"\n❌ VIGA PDF loomisel: {e}")
        import traceback
        traceback.print_exc()
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()