# save_articles_to_pdf.py
"""Export the articles stored in Weaviate to a single PDF report.

The script reads every object of the ``ScientificArticle`` collection,
normalises the text fields (HTML tags, markdown and JSON fences are removed)
and renders one section per article with ReportLab.
"""
import html
import json
import os
import re
import sys
import traceback
from datetime import datetime
from xml.sax.saxutils import escape as xml_escape

from reportlab.lib.pagesizes import letter, A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER, TA_LEFT
from reportlab.lib import colors
from reportlab.lib.units import inch, cm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# Add the src directory to the Python path.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))

from src.weaviate_client import WeaviateClient


# Typographic characters replaced with plain equivalents before rendering.
_TYPOGRAPHIC_TABLE = str.maketrans({
    '\u00a0': ' ',    # non-breaking space
    '\u2026': '...',  # ellipsis
    '\u2013': '-',    # en dash
    '\u2014': '-',    # em dash
    '\u2018': "'",    # left single quote
    '\u2019': "'",    # right single quote
    '\u201c': '"',    # left double quote
    '\u201d': '"',    # right double quote
})

# Structured analysis keys and the label each one gets in the PDF.
_ANALYSIS_FIELDS = (
    ('theoretical_contribution', "TEOREETILINE PANUS"),
    ('practical_applicability', "PRAKTILINE RAKENDATAVUS"),
    ('problem_solving', "PROBLEEMILAHENDUS"),
    ('limitations', "PIIRANGUD"),
)

# Section headings of a summary and the text (with line breaks) that
# replaces each of them.
_SUMMARY_SECTION_BREAKS = (
    ('1. ARTIKLI PEAMISED PUNKTID:', '\n1. ARTIKLI PEAMISED PUNKTID:\n'),
    ('2. KASUTATUD MEETODID:', '\n\n2. KASUTATUD MEETODID:\n'),
    ('3. PEAMISED TULEMUSED:', '\n\n3. PEAMISED TULEMUSED:\n'),
    ('4. JÄRELDUSED JA SOOVITUSED:', '\n\n4. JÄRELDUSED JA SOOVITUSED:\n'),
    ('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n5. TRANSFORDIPLANEERIMISE KONTEKST:\n'),
)


def clean_html_tags(text):
    """Strip HTML/XML tags and decode character entities.

    Args:
        text: Text that may contain markup; falsy values yield ``""``.

    Returns:
        The text without tags, with entities such as ``&amp;`` decoded and
        typographic punctuation replaced by plain characters.
    """
    if not text:
        return ""

    # Remove every HTML/XML tag.
    text = re.sub(r'<[^>]+>', '', text)

    # Decode entities in a single pass, so "&amp;lt;" becomes "&lt;" and is
    # not decoded a second time.
    text = html.unescape(text)

    return text.translate(_TYPOGRAPHIC_TABLE)


def clean_markdown_for_pdf(text):
    """Convert markdown to a single line of plain text.

    Args:
        text: Markdown source; non-strings are converted with ``str``.

    Returns:
        The text without HTML tags and markdown markers, with every run of
        whitespace collapsed to one space. Falsy input yields ``""``.
    """
    if not text:
        return ""

    if not isinstance(text, str):
        text = str(text)

    text = clean_html_tags(text)

    # Headings and inline emphasis: keep the text, drop the markers.
    text = re.sub(r'#{1,6}\s+', '', text)
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'__(.+?)__', r'\1', text)
    text = re.sub(r'~~(.+?)~~', r'\1', text)

    # Lists: bullets become "• ", numbered items keep their number.
    text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)

    # Fenced code blocks are dropped, inline code is wrapped in brackets.
    text = re.sub(r'```[^`]+```', '', text)
    text = re.sub(r'`([^`]+)`', r'[\1]', text)

    # Links: keep only the link text.
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Collapse whitespace (this also removes line breaks).
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def clean_json_markers(text):
    """Remove the ```json / ``` fences that surround a JSON payload.

    Args:
        text: Possibly fenced JSON text.

    Returns:
        The text without fences and surrounding whitespace. A non-string
        argument is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    # Opening fence at the start of a line.
    text = re.sub(r'^```json\s*', '', text, flags=re.MULTILINE)
    # Closing fence at the end of a line.
    text = re.sub(r'\s*```$', '', text, flags=re.MULTILINE)
    # Any fence that is left elsewhere.
    text = re.sub(r'\s*```(json)?\s*', '', text)
    return text.strip()


def clean_json_string(text):
    """Undo one level of string-encoding around a JSON document.

    Removes line breaks, turns ``\\"`` into ``"`` and strips one pair of
    enclosing double quotes.
    """
    text = text.replace('\n', '').replace('\r', '')
    text = text.replace('\\"', '"')

    if text.startswith('"') and text.endswith('"'):
        text = text[1:-1]

    return text


def _loads_embedded_json(value):
    """Parse JSON embedded in a string; return ``None`` when it cannot be parsed.

    The text is tried as-is (fences removed) first and then after
    ``clean_json_string``, which handles a double-encoded document.
    """
    if not isinstance(value, str):
        return None

    stripped = clean_json_markers(value)
    for candidate in (stripped, clean_json_string(stripped)):
        try:
            return json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError
            continue
    return None


def extract_and_format_json(data):
    """Format the known analysis fields of ``data`` as titled text blocks.

    Args:
        data: Mapping with analysis fields; anything else yields ``""``.

    Returns:
        For every non-empty known field: a title line, the value and an empty
        line, joined with line breaks.
    """
    if not isinstance(data, dict):
        return ""

    key_map = {
        "theoretical_contribution": "Theoretical contribution",
        "practical_applicability": "Practical applicability",
        "problem_solving": "Problem solving",
        "limitations": "Limitations",
        "future_research": "Future research",
        "methodology": "Methodology"
    }

    formatted_parts = []
    for key, title in key_map.items():
        if data.get(key):
            formatted_parts.extend((f"{title}", str(data[key]), ""))

    return "\n".join(formatted_parts)


def process_json_text(input_text):
    """Turn a JSON analysis document into readable text.

    The analysis is looked up in three places, in this order: the
    ``"analysis"`` member (a dict or a JSON string), the top-level object
    itself, and finally any string member that looks like an analysis.

    Returns:
        The formatted analysis, or a message describing why none was
        produced. The function does not raise for malformed input.
    """
    try:
        parsed = json.loads(input_text)
    except json.JSONDecodeError as e:
        return f"JSON parsing error: {str(e)}"
    except (TypeError, ValueError) as e:
        return f"Error: {str(e)}"

    analysis_data = None
    if isinstance(parsed, dict):
        if "analysis" in parsed:
            # Variant 1: "analysis" holds a dict or a JSON string.
            analysis = parsed["analysis"]
            if isinstance(analysis, dict):
                analysis_data = analysis
            else:
                analysis_data = _loads_embedded_json(analysis)
        elif any(key in parsed for key in ("theoretical_contribution", "practical_applicability")):
            # Variant 2: the fields sit directly on the top-level object.
            analysis_data = parsed
        else:
            # Variant 3: look for a JSON document in any string member.
            hints = ("theoretical", "practical", "contribution")
            for value in parsed.values():
                if isinstance(value, str) and any(hint in value.lower() for hint in hints):
                    candidate = _loads_embedded_json(value)
                    if isinstance(candidate, dict):
                        analysis_data = candidate
                        break

    if isinstance(analysis_data, dict) and analysis_data:
        return extract_and_format_json(analysis_data)
    return "No analysis data found in JSON"


def parse_transport_context(context_data):
    """Parse a stored transport context into a readable structure.

    Args:
        context_data: A JSON string (optionally fenced), a dict, or any
            other value.

    Returns:
        * for a string: the parsed JSON value, or ``{"raw_analysis": text}``
          with the cleaned text when it is not valid JSON;
        * for a dict: a copy whose string values are cleaned for the PDF;
        * anything else unchanged.
    """
    if isinstance(context_data, str):
        try:
            return json.loads(clean_json_markers(context_data))
        except json.JSONDecodeError:
            return {"raw_analysis": clean_markdown_for_pdf(clean_json_markers(context_data))}

    if isinstance(context_data, dict):
        # Fences are removed first: the markdown cleaner would otherwise
        # discard a fenced JSON payload as a code block.
        return {
            key: clean_markdown_for_pdf(clean_json_markers(value)) if isinstance(value, str) else value
            for key, value in context_data.items()
        }

    return context_data


def format_context_for_pdf(parsed_context):
    """Format a parsed transport context as labelled paragraphs.

    Args:
        parsed_context: Dict produced by ``parse_transport_context`` or any
            other value.

    Returns:
        For a dict, the non-empty known fields as ``LABEL: value`` entries
        separated by blank lines; for anything else its cleaned string form.
    """
    if not isinstance(parsed_context, dict):
        return clean_markdown_for_pdf(str(parsed_context))

    formatted = [
        f"{label}: {parsed_context[key]}"
        for key, label in _ANALYSIS_FIELDS
        if parsed_context.get(key)
    ]

    if 'relevance_score' in parsed_context:
        formatted.append(f"RELEVANTSUSE SKOOR: {parsed_context['relevance_score']}/10")

    for key in ('analysis', 'raw_analysis'):
        if parsed_context.get(key):
            formatted.append(f"ANALÜÜS: {parsed_context[key]}")

    return "\n\n".join(formatted)


def _build_article(props, clean_text, parse_context):
    """Build the article dict used by the PDF writer from Weaviate properties.

    Args:
        props: Mapping of the object's properties.
        clean_text: Callable applied to every free-text field.
        parse_context: Callable applied to the ``transport_context`` value.
    """
    return {
        'article_id': props.get('article_id', 'N/A'),
        'title': clean_text(props.get('title', 'N/A')),
        'authors': props.get('authors') or [],
        'year': props.get('year', 'N/A'),
        'journal': clean_text(props.get('journal', 'N/A')),
        'doi': props.get('doi', ''),
        'abstract_en': clean_text(props.get('abstract_en', '')),
        'summary_et': clean_text(props.get('summary_et', '')),
        # "or []" covers properties that are present but null.
        'key_concepts': [clean_text(c) for c in props.get('key_concepts') or []],
        'methods_used': [clean_text(m) for m in props.get('methods_used') or []],
        'transport_context': parse_context(props.get('transport_context', {})),
        'relevance_score': props.get('relevance_score', 'N/A'),
        'processing_date': props.get('processing_date', ''),
        'source_file': props.get('source_file', '')
    }


def get_all_articles_from_weaviate():
    """Fetch every article from the ``ScientificArticle`` collection.

    Returns:
        A list of article dicts (see ``_build_article``). Errors while
        talking to Weaviate are printed and the articles collected so far
        are returned; the client is always closed.
    """
    client = WeaviateClient()
    articles = []

    try:
        collection = client.client.collections.get("ScientificArticle")

        count_response = collection.aggregate.over_all(total_count=True)
        total = count_response.total_count

        print(f"Weaviate'is leidsin {total} artiklit")

        if total > 0:
            response = collection.query.fetch_objects(limit=total)

            for obj in response.objects:
                props = obj.properties
                try:
                    article = _build_article(props, clean_markdown_for_pdf, parse_transport_context)
                except Exception as e:
                    # Best effort: keep the article, but without cleaning.
                    print(f" Viga artikli {props.get('article_id', 'unknown')} töötlemisel: {e}")
                    article = _build_article(props, str, str)
                articles.append(article)

    except Exception as e:
        print(f"Viga artiklite toomisel: {e}")
        traceback.print_exc()
    finally:
        client.close()

    return articles


def format_summary_for_pdf(summary):
    """Prepare an article summary for the PDF.

    The summary is reduced to plain text, line breaks are inserted around the
    known section headings and the result is cut off after 4000 characters.
    """
    if not summary:
        return ""

    if not isinstance(summary, str):
        summary = str(summary)

    summary = clean_markdown_for_pdf(summary)

    for heading, replacement in _SUMMARY_SECTION_BREAKS:
        summary = summary.replace(heading, replacement)

    # Never more than one empty line in a row.
    summary = re.sub(r'\n{3,}', '\n\n', summary)

    if len(summary) > 4000:
        summary = summary[:4000] + "... [kokkuvõte lõigatud, liiga pikk]"

    return summary


def _paragraph_markup(text):
    """Return ``text`` as ReportLab paragraph markup.

    ``&``, ``<`` and ``>`` are escaped so that article text is not parsed as
    markup, and line breaks are turned into ``<br/>`` tags.
    """
    return xml_escape(str(text)).replace('\n', '<br/>')


def _as_text(value):
    """Render an analysis value (string, list or scalar) as text."""
    if isinstance(value, (list, tuple)):
        return ", ".join(str(item) for item in value)
    return str(value)


def _extract_json_string_field(text, key):
    """Find ``"key": "value"`` in JSON-like text and return the decoded value.

    Unlike a full parse this also works when the surrounding document is
    malformed. Escaped quotes inside the value are handled. Returns ``""``
    when the key is not found.
    """
    pattern = r'"' + re.escape(key) + r'"\s*:\s*"((?:[^"\\]|\\.)*)"'
    match = re.search(pattern, text, flags=re.DOTALL)
    if not match:
        return ""

    raw = match.group(1)
    try:
        # strict=False accepts raw line breaks inside the value.
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        return raw


def _analysis_to_dict(analysis):
    """Return the structured fields contained in an analysis value.

    A dict is returned as-is. A string is parsed as JSON; when that fails the
    known fields are extracted one by one with regular expressions.
    """
    if isinstance(analysis, dict):
        return analysis
    if not isinstance(analysis, str) or not analysis.strip():
        return {}

    parsed = _loads_embedded_json(analysis)
    if isinstance(parsed, dict):
        return parsed

    found = {}
    for key, _label in _ANALYSIS_FIELDS:
        value = _extract_json_string_field(analysis, key)
        if value:
            found[key] = value

    score = re.search(r'"relevance_score"\s*:\s*(\d+(?:\.\d+)?)', analysis)
    if score:
        found['relevance_score'] = score.group(1)
    return found


def _transport_context_paragraphs(context):
    """Split a transport context into a score and analysis paragraphs.

    Args:
        context: Value of ``article['transport_context']`` (dict, string or
            any other parsed JSON value).

    Returns:
        ``(score_text, lines)`` where ``score_text`` is the formatted
        top-level relevance score (``""`` when missing) and ``lines`` is the
        list of plain-text paragraphs to print below the analysis heading.
    """
    if isinstance(context, str):
        context = parse_transport_context(context)
    if not isinstance(context, dict):
        return "", [clean_markdown_for_pdf(str(context))]

    score = context.get('relevance_score')
    score_text = "" if score is None or score == "" else format_context_for_pdf(score)

    analysis = context.get('analysis') or context.get('raw_analysis') or ""
    embedded = _analysis_to_dict(analysis)

    lines = []
    for key, label in _ANALYSIS_FIELDS:
        # Fields of the embedded analysis win over top-level ones.
        value = embedded.get(key) or context.get(key)
        if value:
            lines.append(f"{label}:")
            lines.append(_as_text(value))

    embedded_score = embedded.get('relevance_score')
    if embedded_score is not None and embedded_score != "":
        lines.append("RELEVANTSUSE SKOOR: " + str(embedded_score))

    # Free-text analysis without structured fields is printed as it is.
    if not lines and isinstance(analysis, str) and analysis.strip():
        lines.append(clean_markdown_for_pdf(analysis))

    return score_text, lines


def _format_processing_date(value):
    """Format a processing date (``datetime`` or ISO string) for the footer.

    Strings that cannot be parsed are returned cut to 19 characters.
    """
    if isinstance(value, datetime):
        return value.strftime('%d.%m.%Y %H:%M')

    text = str(value)
    try:
        # Drop fractional seconds; "Z" is not accepted by older Pythons.
        iso_text = text.split('.')[0].replace('Z', '+00:00')
        return datetime.fromisoformat(iso_text).strftime('%d.%m.%Y %H:%M')
    except ValueError:
        return text[:19]


def create_pdf_from_articles(articles, output_filename):
    """Write the articles to a PDF file.

    Args:
        articles: List of article dicts as returned by
            ``get_all_articles_from_weaviate``.
        output_filename: Path of the PDF to create.

    Returns:
        The number of articles written.
    """
    doc = SimpleDocTemplate(
        output_filename,
        pagesize=A4,
        rightMargin=72,
        leftMargin=72,
        topMargin=72,
        bottomMargin=72
    )

    styles = getSampleStyleSheet()

    title_style = ParagraphStyle(
        'CustomTitle',
        parent=styles['Heading1'],
        fontSize=14,
        spaceAfter=12,
        textColor=colors.HexColor('#2c3e50'),
        alignment=TA_LEFT
    )

    subtitle_style = ParagraphStyle(
        'CustomSubtitle',
        parent=styles['Heading2'],
        fontSize=12,
        spaceAfter=6,
        textColor=colors.HexColor('#34495e'),
        alignment=TA_LEFT
    )

    section_style = ParagraphStyle(
        'CustomSection',
        parent=styles['Heading3'],
        fontSize=11,
        spaceAfter=6,
        spaceBefore=12,
        textColor=colors.HexColor('#7f8c8d'),
        alignment=TA_LEFT
    )

    normal_style = ParagraphStyle(
        'CustomNormal',
        parent=styles['Normal'],
        fontSize=10,
        spaceAfter=6,
        alignment=TA_JUSTIFY,
        leading=14  # line spacing
    )

    metadata_style = ParagraphStyle(
        'CustomMetadata',
        parent=styles['Normal'],
        fontSize=9,
        spaceAfter=3,
        textColor=colors.HexColor('#5d6d7e'),
        alignment=TA_LEFT
    )

    # Table values are Paragraphs so that long text wraps inside the column;
    # plain strings in a table cell are not wrapped.
    cell_style = ParagraphStyle(
        'CustomCell',
        parent=styles['Normal'],
        fontName='Helvetica',
        fontSize=9,
        leading=11
    )

    def value_cell(text):
        return Paragraph(_paragraph_markup(text), cell_style)

    elements = []

    # Report header.
    elements.append(Paragraph("TEADUSARTIKLITE ANDMEBAAS", title_style))
    elements.append(Spacer(1, 12))

    today = datetime.now().strftime("%d.%m.%Y %H:%M")
    elements.append(Paragraph(f"Eksporditud: {today}", metadata_style))
    elements.append(Paragraph(f"Artikleid kokku: {len(articles)}", metadata_style))
    elements.append(Spacer(1, 24))

    for number, article in enumerate(articles, start=1):
        # Title.
        elements.append(Paragraph(f"{number}. {_paragraph_markup(article['title'])}", title_style))

        # Authors.
        authors = article['authors']
        if authors:
            if isinstance(authors, str):
                authors_text = authors
            else:
                authors_text = ", ".join(str(author) for author in authors)
            elements.append(Paragraph(f"Autorid: {_paragraph_markup(authors_text)}", subtitle_style))

        # Metadata table.
        metadata_data = []

        if article['year'] and article['year'] != 'N/A':
            metadata_data.append(['Aasta:', value_cell(article['year'])])

        if article['journal'] and article['journal'] != 'N/A':
            metadata_data.append(['Žurnaal:', value_cell(article['journal'])])

        if article['doi']:
            metadata_data.append(['DOI:', value_cell(article['doi'])])

        # A score of 0 is a valid score and is shown as well.
        relevance = article['relevance_score']
        if relevance is not None and relevance != '' and relevance != 'N/A':
            metadata_data.append(['Relevantsus:', value_cell(f"{relevance}/10")])

        if metadata_data:
            metadata_table = Table(metadata_data, colWidths=[2*cm, 12*cm])
            metadata_table.setStyle(TableStyle([
                ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
                ('FONTSIZE', (0, 0), (-1, -1), 9),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
                ('TOPPADDING', (0, 0), (-1, -1), 6),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('LEFTPADDING', (0, 0), (0, -1), 0),
            ]))
            elements.append(metadata_table)
            elements.append(Spacer(1, 12))

        # Key concepts and methods.
        tags_data = []

        if article['key_concepts']:
            # At most 10 concepts are listed.
            concepts_text = ", ".join(str(c) for c in article['key_concepts'][:10])
            tags_data.append(['Võtmesõnad:', value_cell(concepts_text)])

        if article['methods_used']:
            methods_text = ", ".join(str(m) for m in article['methods_used'])
            tags_data.append(['Meetodid:', value_cell(methods_text)])

        if tags_data:
            tags_table = Table(tags_data, colWidths=[2*cm, 12*cm])
            tags_table.setStyle(TableStyle([
                ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
                ('FONTSIZE', (0, 0), (-1, -1), 9),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
                ('TOPPADDING', (0, 0), (-1, -1), 4),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TEXTCOLOR', (0, 0), (0, -1), colors.HexColor('#5d6d7e')),
                ('LEFTPADDING', (0, 0), (0, -1), 0),
            ]))
            elements.append(tags_table)
            elements.append(Spacer(1, 12))

        # Abstract, cut to 800 characters.
        if article['abstract_en']:
            elements.append(Paragraph("ABSTRAKT (inglise keeles):", section_style))
            abstract_text = article['abstract_en']
            if len(abstract_text) > 800:
                abstract_text = abstract_text[:800] + "..."
            elements.append(Paragraph(_paragraph_markup(abstract_text), normal_style))
            elements.append(Spacer(1, 12))

        # Summary.
        if article['summary_et']:
            elements.append(Paragraph("KOKKUVÕTE (eesti keeles):", section_style))
            summary = format_summary_for_pdf(article['summary_et'])
            elements.append(Paragraph(_paragraph_markup(summary), normal_style))
            elements.append(Spacer(1, 12))

        # Transport planning context.
        if article['transport_context']:
            # Debug output of the raw context.
            debugger_data = str(article['transport_context'])
            print("----------- \"" + article['title'] + "\" -----------")
            print("----------- article['transport_context'] -------------")
            print(debugger_data)

            elements.append(Paragraph("TRANSFORDIPLANEERIMISE KONTEKST:", section_style))

            score_text, analysis_lines = _transport_context_paragraphs(article['transport_context'])
            if score_text:
                elements.append(Paragraph("RELEVANTSUSE SKOOR: " + _paragraph_markup(score_text), normal_style))
                elements.append(Spacer(1, 1))

            elements.append(Paragraph("ANALÜÜS:", normal_style))
            for line in analysis_lines:
                elements.append(Paragraph(_paragraph_markup(line), normal_style))

        # Source file and processing information.
        footer_info = []

        if article['source_file']:
            source_name = os.path.basename(article['source_file'])
            footer_info.append(f"Allikfail: {source_name}")

        if article['processing_date']:
            footer_info.append(f"Töödeldud: {_format_processing_date(article['processing_date'])}")

        if footer_info:
            elements.append(Spacer(1, 6))
            elements.append(Paragraph(_paragraph_markup(" | ".join(footer_info)), metadata_style))

        # Page break after every article except the last one.
        if number < len(articles):
            elements.append(PageBreak())
        else:
            elements.append(Spacer(1, 24))

    # Closing information.
    elements.append(Paragraph("=" * 80, metadata_style))
    elements.append(Spacer(1, 6))
    elements.append(Paragraph(f"Kokku eksporditud artikleid: {len(articles)}", metadata_style))
    elements.append(Paragraph("Eksporditud Weaviate teadusartiklite andmebaasist", metadata_style))
    elements.append(Paragraph(f"PDF genereeritud: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}", metadata_style))

    doc.build(elements)

    return len(articles)


def main():
    """Fetch the articles from Weaviate and export them to a timestamped PDF."""
    print("=" * 60)
    print("ARTIKLITE EKSPORT PDF FAILI")
    print("=" * 60)

    print("Toon artikleid Weaviate'ist...")
    articles = get_all_articles_from_weaviate()

    if not articles:
        print("Ei leidnud ühtegi artiklit Weaviate'is!")
        return

    print(f"Leidsin {len(articles)} artiklit")

    # Output file name with a timestamp.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = "./data/exports"
    os.makedirs(output_dir, exist_ok=True)
    output_filename = os.path.join(output_dir, f"artiklid_eksport_{timestamp}.pdf")

    print(f"Loon PDF faili: {output_filename}")

    try:
        article_count = create_pdf_from_articles(articles, output_filename)

        print("=" * 60)
        print(f"✅ VALMIS! Loodud PDF fail: {output_filename}")
        print(f" - Eksporditud artikleid: {article_count}")
        print(f" - Faili suurus: {os.path.getsize(output_filename) / 1024:.1f} KB")
        print("=" * 60)

        # Show the titles of the first three articles.
        if articles:
            print("\nEsimesed artiklid:")
            for i, article in enumerate(articles[:3]):
                title_preview = article['title']
                if len(title_preview) > 60:
                    title_preview = title_preview[:60] + "..."
                print(f" {i+1}. {title_preview}")

    except Exception as e:
        print(f"\n❌ VIGA PDF loomisel: {e}")
        traceback.print_exc()


if __name__ == "__main__":
    main()