Browse Source

Artiklide eksport baasist pdf faili

Ardo Kubjas 4 tháng trước cách đây
mục cha
commit
db24d10478
1 tập tin đã thay đổi với 490 bổ sung và 0 xóa
  1. 490 0
      save_articles_to_pdf.py

+ 490 - 0
save_articles_to_pdf.py

@@ -0,0 +1,490 @@
+# save_articles_to_pdf.py
+
+import os
+import sys
+import re
+from datetime import datetime
+from reportlab.lib.pagesizes import letter, A4
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER, TA_LEFT
+from reportlab.lib import colors
+from reportlab.lib.units import inch, cm
+from reportlab.pdfbase import pdfmetrics
+from reportlab.pdfbase.ttfonts import TTFont
+import json
+
+# Add the "src" directory (next to this script) to the front of the Python path.
+# NOTE(review): presumably so that modules inside src/ can import each other by
+# bare name; the import below already resolves "src" as a package — confirm.
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+from src.weaviate_client import WeaviateClient
+
+def clean_html_tags(text):
+    """Puhasta tekst HTML/XML siltidest ReportLab-i jaoks"""
+    if not text:
+        return ""
+    
+    # Eemalda kõik HTML/XML siltid
+    text = re.sub(r'<[^>]+>', '', text)
+    
+    # Asenda erimärgid ReportLab-ile sobivate märkidega
+    replacements = {
+        '&nbsp;': ' ',
+        '&amp;': '&',
+        '&lt;': '<',
+        '&gt;': '>',
+        '&quot;': '"',
+        '&#39;': "'",
+        '&apos;': "'",
+        '\u00a0': ' ',  # mitte-tühik
+        '\u2026': '...',  # ellipsis
+        '\u2013': '-',   # n-sild
+        '\u2014': '-',   # m-sild
+        '\u2018': "'",   # vasak ülakoma
+        '\u2019': "'",   # parem ülakoma
+        '\u201c': '"',   # vasak jutumärk
+        '\u201d': '"',   # parem jutumärk
+    }
+    
+    for old, new in replacements.items():
+        text = text.replace(old, new)
+    
+    return text
+
+def clean_markdown_for_pdf(text):
+    """Konverteeri markdown ReportLab-ile sobivaks tekstiks"""
+    if not text:
+        return ""
+    
+    # Kui ei ole string, konverteeri stringiks
+    if not isinstance(text, str):
+        text = str(text)
+    
+    # Eemalda HTML siltid
+    text = clean_html_tags(text)
+    
+    # Asenda markdown pealkirjad
+    text = re.sub(r'#{1,6}\s+', '', text)  # Eemalda # pealkirjad
+    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # Asenda **bold** lihtsalt tekstiga
+    text = re.sub(r'\*(.+?)\*', r'\1', text)  # Asenda *italic* lihtsalt tekstiga
+    text = re.sub(r'__(.+?)__', r'\1', text)  # Asenda __underline__ lihtsalt tekstiga
+    text = re.sub(r'~~(.+?)~~', r'\1', text)  # Asenda ~~strikethrough~~ lihtsalt tekstiga
+    
+    # Asenda loetelud (PARANDATUD: kasuta \\1 mitte \1)
+    text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
+    text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)  # PARANDATUD
+    
+    # Asenda koodiblokid
+    text = re.sub(r'```[^`]+```', '', text)  # Eemalda koodiblokid
+    text = re.sub(r'`([^`]+)`', r'[\1]', text)  # Asenda inline kood
+    
+    # Asenda lingid
+    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # Eemalda lingid
+    
+    # Eemalda liigsed tühikud
+    text = re.sub(r'\s+', ' ', text)
+    
+    return text.strip()
+
+def parse_transport_context(context_data):
+    """Parsi transpordikonteksti JSON-ist loetavaks"""
+    if isinstance(context_data, str):
+        # Proovi parsida string JSON-iks
+        try:
+            return json.loads(context_data)
+        except json.JSONDecodeError:
+            # Kui ei õnnestu, tagasta puhastatud string
+            return {"raw_analysis": clean_markdown_for_pdf(context_data)}
+    elif isinstance(context_data, dict):
+        # Puhasta kõik stringiväljad
+        cleaned = {}
+        for key, value in context_data.items():
+            if isinstance(value, str):
+                cleaned[key] = clean_markdown_for_pdf(value)
+            else:
+                cleaned[key] = value
+        return cleaned
+    return context_data
+
+def format_context_for_pdf(parsed_context):
+    """Vorminda parsitud kontekst PDF-i jaoks"""
+    if isinstance(parsed_context, dict):
+        formatted = []
+        
+        if 'theoretical_contribution' in parsed_context and parsed_context['theoretical_contribution']:
+            formatted.append(f"TEOREETILINE PANUS: {parsed_context['theoretical_contribution']}")
+        
+        if 'practical_applicability' in parsed_context and parsed_context['practical_applicability']:
+            formatted.append(f"PRAKTILINE RAKENDATAVUS: {parsed_context['practical_applicability']}")
+        
+        if 'problem_solving' in parsed_context and parsed_context['problem_solving']:
+            formatted.append(f"PROBLEEMILAHDUS: {parsed_context['problem_solving']}")
+        
+        if 'limitations' in parsed_context and parsed_context['limitations']:
+            formatted.append(f"PIIRANGUD: {parsed_context['limitations']}")
+        
+        if 'relevance_score' in parsed_context:
+            formatted.append(f"RELEVANTSUSE SKOOR: {parsed_context['relevance_score']}/10")
+        
+        if 'analysis' in parsed_context and parsed_context['analysis']:
+            formatted.append(f"ANALÜÜS: {parsed_context['analysis']}")
+        
+        if 'raw_analysis' in parsed_context and parsed_context['raw_analysis']:
+            formatted.append(f"ANALÜÜS: {parsed_context['raw_analysis']}")
+        
+        return "\n\n".join(formatted)
+    else:
+        return clean_markdown_for_pdf(str(parsed_context))
+
+def get_all_articles_from_weaviate():
+    """Toob kõik artiklid Weaviate'ist"""
+    client = WeaviateClient()
+    articles = []
+    
+    try:
+        collection = client.client.collections.get("ScientificArticle")
+        
+        # Loendi kokku
+        count_response = collection.aggregate.over_all(total_count=True)
+        total = count_response.total_count
+        
+        print(f"Weaviate'is leidsin {total} artiklit")
+        
+        if total > 0:
+            # Toob kõik artiklid
+            response = collection.query.fetch_objects(limit=total)
+            
+            for obj in response.objects:
+                try:
+                    article = {
+                        'article_id': obj.properties.get('article_id', 'N/A'),
+                        'title': clean_markdown_for_pdf(obj.properties.get('title', 'N/A')),
+                        'authors': obj.properties.get('authors', []),
+                        'year': obj.properties.get('year', 'N/A'),
+                        'journal': clean_markdown_for_pdf(obj.properties.get('journal', 'N/A')),
+                        'doi': obj.properties.get('doi', ''),
+                        'abstract_en': clean_markdown_for_pdf(obj.properties.get('abstract_en', '')),
+                        'summary_et': clean_markdown_for_pdf(obj.properties.get('summary_et', '')),
+                        'key_concepts': [clean_markdown_for_pdf(c) for c in obj.properties.get('key_concepts', [])],
+                        'methods_used': [clean_markdown_for_pdf(m) for m in obj.properties.get('methods_used', [])],
+                        'transport_context': parse_transport_context(obj.properties.get('transport_context', {})),
+                        'relevance_score': obj.properties.get('relevance_score', 'N/A'),
+                        'processing_date': obj.properties.get('processing_date', ''),
+                        'source_file': obj.properties.get('source_file', '')
+                    }
+                    articles.append(article)
+                except Exception as e:
+                    print(f"  Viga artikli {obj.properties.get('article_id', 'unknown')} töötlemisel: {e}")
+                    # Lisa artikel ilma puhastuseta
+                    article = {
+                        'article_id': obj.properties.get('article_id', 'N/A'),
+                        'title': str(obj.properties.get('title', 'N/A')),
+                        'authors': obj.properties.get('authors', []),
+                        'year': obj.properties.get('year', 'N/A'),
+                        'journal': str(obj.properties.get('journal', 'N/A')),
+                        'doi': obj.properties.get('doi', ''),
+                        'abstract_en': str(obj.properties.get('abstract_en', '')),
+                        'summary_et': str(obj.properties.get('summary_et', '')),
+                        'key_concepts': [str(c) for c in obj.properties.get('key_concepts', [])],
+                        'methods_used': [str(m) for m in obj.properties.get('methods_used', [])],
+                        'transport_context': str(obj.properties.get('transport_context', {})),
+                        'relevance_score': obj.properties.get('relevance_score', 'N/A'),
+                        'processing_date': obj.properties.get('processing_date', ''),
+                        'source_file': obj.properties.get('source_file', '')
+                    }
+                    articles.append(article)
+                
+    except Exception as e:
+        print(f"Viga artiklite toomisel: {e}")
+        import traceback
+        traceback.print_exc()
+    finally:
+        client.close()
+    
+    return articles
+
+def format_summary_for_pdf(summary):
+    """Vorminda kokkuvõte PDF-ile sobivaks"""
+    if not summary:
+        return ""
+    
+    # Kui ei ole string, konverteeri
+    if not isinstance(summary, str):
+        summary = str(summary)
+    
+    # Eemalda kõik vormindus ja tee lihtsaks tekstiks
+    summary = clean_markdown_for_pdf(summary)
+    
+    # Lisa uued read peamiste sektsioonide ette
+    summary = summary.replace('1. ARTIKLI PEAMISED PUNKTID:', '\n1. ARTIKLI PEAMISED PUNKTID:\n')
+    summary = summary.replace('2. KASUTATUD MEETODID:', '\n\n2. KASUTATUD MEETODID:\n')
+    summary = summary.replace('3. PEAMISED TULEMUSED:', '\n\n3. PEAMISED TULEMUSED:\n')
+    summary = summary.replace('4. JÄRELDUSED JA SOOVITUSED:', '\n\n4. JÄRELDUSED JA SOOVITUSED:\n')
+    summary = summary.replace('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n5. TRANSFORDIPLANEERIMISE KONTEKST:\n')
+    
+    # Asenda liigsed reavahetused
+    summary = re.sub(r'\n{3,}', '\n\n', summary)
+    
+    # Lõika liiga pikk tekst
+    if len(summary) > 4000:
+        summary = summary[:4000] + "... [kokkuvõte lõigatud, liiga pikk]"
+    
+    return summary
+
+def create_pdf_from_articles(articles, output_filename):
+    """Loob PDF faili artiklitest"""
+    
+    # Loo PDF dokument
+    doc = SimpleDocTemplate(
+        output_filename,
+        pagesize=A4,
+        rightMargin=72,
+        leftMargin=72,
+        topMargin=72,
+        bottomMargin=72
+    )
+    
+    # Stiilide loomine
+    styles = getSampleStyleSheet()
+    
+    # Kohandatud stiilid
+    title_style = ParagraphStyle(
+        'CustomTitle',
+        parent=styles['Heading1'],
+        fontSize=14,
+        spaceAfter=12,
+        textColor=colors.HexColor('#2c3e50'),
+        alignment=TA_LEFT
+    )
+    
+    subtitle_style = ParagraphStyle(
+        'CustomSubtitle',
+        parent=styles['Heading2'],
+        fontSize=12,
+        spaceAfter=6,
+        textColor=colors.HexColor('#34495e'),
+        alignment=TA_LEFT
+    )
+    
+    section_style = ParagraphStyle(
+        'CustomSection',
+        parent=styles['Heading3'],
+        fontSize=11,
+        spaceAfter=6,
+        spaceBefore=12,
+        textColor=colors.HexColor('#7f8c8d'),
+        alignment=TA_LEFT
+    )
+    
+    normal_style = ParagraphStyle(
+        'CustomNormal',
+        parent=styles['Normal'],
+        fontSize=10,
+        spaceAfter=6,
+        alignment=TA_JUSTIFY,
+        leading=14  # Reavahe
+    )
+    
+    metadata_style = ParagraphStyle(
+        'CustomMetadata',
+        parent=styles['Normal'],
+        fontSize=9,
+        spaceAfter=3,
+        textColor=colors.HexColor('#5d6d7e'),
+        alignment=TA_LEFT
+    )
+    
+    # Elementide kogumine
+    elements = []
+    
+    # Pealkiri ja kokkuvõte
+    elements.append(Paragraph("TEADUSARTIKLITE ANDMEBAAS", title_style))
+    elements.append(Spacer(1, 12))
+    
+    today = datetime.now().strftime("%d.%m.%Y %H:%M")
+    elements.append(Paragraph(f"Eksporditud: {today}", metadata_style))
+    elements.append(Paragraph(f"Artikleid kokku: {len(articles)}", metadata_style))
+    elements.append(Spacer(1, 24))
+    
+    # Iga artikli jaoks
+    for i, article in enumerate(articles):
+        # Artikli pealkiri
+        elements.append(Paragraph(f"{i+1}. {article['title']}", title_style))
+        
+        # Autorid
+        if article['authors']:
+            authors_text = ", ".join(article['authors'])
+            elements.append(Paragraph(f"<b>Autorid:</b> {authors_text}", subtitle_style))
+        
+        # Metaandmed tabelina
+        metadata_data = []
+        
+        if article['year'] and article['year'] != 'N/A':
+            metadata_data.append(['Aasta:', str(article['year'])])
+        
+        if article['journal'] and article['journal'] != 'N/A':
+            metadata_data.append(['Žurnaal:', article['journal']])
+        
+        if article['doi']:
+            metadata_data.append(['DOI:', article['doi']])
+        
+        if article['relevance_score'] and article['relevance_score'] != 'N/A':
+            metadata_data.append(['Relevantsus:', f"{article['relevance_score']}/10"])
+        
+        if metadata_data:
+            metadata_table = Table(metadata_data, colWidths=[2*cm, 12*cm])
+            metadata_table.setStyle(TableStyle([
+                ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
+                ('FONTSIZE', (0, 0), (-1, -1), 9),
+                ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
+                ('TOPPADDING', (0, 0), (-1, -1), 6),
+                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
+                ('LEFTPADDING', (0, 0), (0, -1), 0),
+            ]))
+            elements.append(metadata_table)
+            elements.append(Spacer(1, 12))
+        
+        # Võtmesõnad ja meetodid
+        tags_data = []
+        
+        if article['key_concepts']:
+            concepts_text = ", ".join(article['key_concepts'][:10])  # Piirangu 10 mõistele
+            tags_data.append(['Võtmesõnad:', concepts_text])
+        
+        if article['methods_used']:
+            methods_text = ", ".join(article['methods_used'])
+            tags_data.append(['Meetodid:', methods_text])
+        
+        if tags_data:
+            tags_table = Table(tags_data, colWidths=[2*cm, 12*cm])
+            tags_table.setStyle(TableStyle([
+                ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
+                ('FONTSIZE', (0, 0), (-1, -1), 9),
+                ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
+                ('TOPPADDING', (0, 0), (-1, -1), 4),
+                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
+                ('TEXTCOLOR', (0, 0), (0, -1), colors.HexColor('#5d6d7e')),
+                ('LEFTPADDING', (0, 0), (0, -1), 0),
+            ]))
+            elements.append(tags_table)
+            elements.append(Spacer(1, 12))
+        
+        # Abstrakt
+        if article['abstract_en']:
+            elements.append(Paragraph("<b>ABSTRAKT (inglise keeles):</b>", section_style))
+            abstract_text = article['abstract_en']
+            if len(abstract_text) > 800:
+                abstract_text = abstract_text[:800] + "..."
+            elements.append(Paragraph(abstract_text, normal_style))
+            elements.append(Spacer(1, 12))
+        
+        # Kokkuvõte
+        if article['summary_et']:
+            elements.append(Paragraph("<b>KOKKUVÕTE (eesti keeles):</b>", section_style))
+            
+            # Formateeri kokkuvõte PDF-ile
+            summary = format_summary_for_pdf(article['summary_et'])
+            
+            # Kasuta lihtsat tekstiparagraphi
+            elements.append(Paragraph(summary, normal_style))
+            elements.append(Spacer(1, 12))
+        
+        # Transpordi kontekst
+        if article['transport_context']:
+            elements.append(Paragraph("<b>TRANSFORDIPLANEERIMISE KONTEKST:</b>", section_style))
+            context_text = format_context_for_pdf(article['transport_context'])
+            if context_text:
+                elements.append(Paragraph(context_text, normal_style))
+                elements.append(Spacer(1, 12))
+        
+        # Allikfail ja töötlemise info
+        footer_info = []
+        if article['source_file']:
+            source_name = os.path.basename(article['source_file'])
+            footer_info.append(f"Allikfail: {source_name}")
+        
+        if article['processing_date']:
+            # Proovi parsida kuupäeva
+            try:
+                # Eemalda mikrosekundid kui on
+                date_str = article['processing_date']
+                if '.' in date_str:
+                    date_str = date_str.split('.')[0]
+                date_str = date_str.replace('Z', '+00:00')
+                date_obj = datetime.fromisoformat(date_str)
+                footer_info.append(f"Töödeldud: {date_obj.strftime('%d.%m.%Y %H:%M')}")
+            except Exception as e:
+                # Kui ei õnnestu parsida, kuva algne string (lõigatud)
+                footer_info.append(f"Töödeldud: {article['processing_date'][:19]}")
+        
+        if footer_info:
+            elements.append(Spacer(1, 6))
+            elements.append(Paragraph(" | ".join(footer_info), metadata_style))
+        
+        # Lisa lehevahetus (välja arvatud viimase artikli puhul)
+        if i < len(articles) - 1:
+            elements.append(PageBreak())
+        else:
+            elements.append(Spacer(1, 24))
+    
+    # Lisa lõppinfo
+    elements.append(Paragraph("=" * 80, metadata_style))
+    elements.append(Spacer(1, 6))
+    elements.append(Paragraph(f"Kokku eksporditud artikleid: {len(articles)}", metadata_style))
+    elements.append(Paragraph("Eksporditud Weaviate teadusartiklite andmebaasist", metadata_style))
+    elements.append(Paragraph(f"PDF genereeritud: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}", metadata_style))
+    
+    # Koosta PDF
+    doc.build(elements)
+    
+    return len(articles)
+
+def main():
+    """Peamine funktsioon"""
+    print("=" * 60)
+    print("ARTIKLITE EKSPORT PDF FAILI")
+    print("=" * 60)
+    
+    # Toob artiklid Weaviate'ist
+    print("Toon artikleid Weaviate'ist...")
+    articles = get_all_articles_from_weaviate()
+    
+    if not articles:
+        print("Ei leidnud ühtegi artiklit Weaviate'is!")
+        return
+    
+    print(f"Leidsin {len(articles)} artiklit")
+    
+    # Genereeri PDF failinimi
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    output_dir = "./data/exports"
+    os.makedirs(output_dir, exist_ok=True)
+    output_filename = os.path.join(output_dir, f"artiklid_eksport_{timestamp}.pdf")
+    
+    # Loo PDF
+    print(f"Loon PDF faili: {output_filename}")
+    try:
+        article_count = create_pdf_from_articles(articles, output_filename)
+        
+        print("=" * 60)
+        print(f"✅ VALMIS! Loodud PDF fail: {output_filename}")
+        print(f"   - Eksporditud artikleid: {article_count}")
+        print(f"   - Faili suurus: {os.path.getsize(output_filename) / 1024:.1f} KB")
+        print("=" * 60)
+        
+        # Näita esimese artikli pealkirja
+        if articles:
+            print("\nEsimesed artiklid:")
+            for i, article in enumerate(articles[:3]):
+                title_preview = article['title']
+                if len(title_preview) > 60:
+                    title_preview = title_preview[:60] + "..."
+                print(f"  {i+1}. {title_preview}")
+    
+    except Exception as e:
+        print(f"\n❌ VIGA PDF loomisel: {e}")
+        import traceback
+        traceback.print_exc()
+
+# Run the export only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()