Переглянути джерело

Artiklide eksport uus puhastatud versioon

Ardo Kubjas 3 місяці тому
батько
коміт
f80f768427
1 змінених файлів з 421 додано та 348 видалено
  1. 421 348
      save_articles_to_pdf.py

+ 421 - 348
save_articles_to_pdf.py

@@ -1,43 +1,45 @@
-# save_articles_to_pdf.py
-
 import os
 import sys
 import re
 from datetime import datetime
-from reportlab.lib.pagesizes import letter, A4
+
+from reportlab.lib.pagesizes import A4
 from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER, TA_LEFT
+from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT
 from reportlab.lib import colors
-from reportlab.lib.units import inch, cm
-from reportlab.pdfbase import pdfmetrics
-from reportlab.pdfbase.ttfonts import TTFont
+from reportlab.lib.units import cm
+
 import json
+import traceback
 
 # Lisa src kaust Pythoni teele
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
-
 from src.weaviate_client import WeaviateClient
 
+
+# ============================================================================
+# STRINGI PUHASTAMISE FUNKTSIOONID
+# ============================================================================
+
 def clean_html_tags(text):
     """Puhasta tekst HTML/XML siltidest ReportLab-i jaoks"""
     if not text:
         return ""
-    
-    # Eemalda kõik HTML/XML siltid
+    # Eemalda kõik HTML/XML sildid
     text = re.sub(r'<[^>]+>', '', text)
-    
+
     # Asenda erimärgid ReportLab-ile sobivate märkidega
     replacements = {
-        '&nbsp;': ' ',
-        '&amp;': '&',
-        '&lt;': '<',
-        '&gt;': '>',
-        '&quot;': '"',
-        '&#39;': "'",
-        '&apos;': "'",
-        '\u00a0': ' ',  # mitte-tühik
-        '\u2026': '...',  # ellipsis
+        ' ': ' ',
+        '&': '&',
+        '<': '<',
+        '>': '>',
+        '"': '"',
+        '': "'",
+        '': "'",
+        '\u00a0': ' ',   # mitte-tühik
+        '\u2026': '...', # ellipsis
         '\u2013': '-',   # n-sild
         '\u2014': '-',   # m-sild
         '\u2018': "'",   # vasak ülakoma
@@ -45,206 +47,327 @@ def clean_html_tags(text):
         '\u201c': '"',   # vasak jutumärk
         '\u201d': '"',   # parem jutumärk
     }
-    
+
     for old, new in replacements.items():
         text = text.replace(old, new)
-    
+
     return text
 
+
 def clean_markdown_for_pdf(text):
     """Konverteeri markdown ReportLab-ile sobivaks tekstiks"""
     if not text:
         return ""
-    
+
     # Kui ei ole string, konverteeri stringiks
     if not isinstance(text, str):
         text = str(text)
-    
-    # Eemalda HTML siltid
+
+    # Eemalda HTML sildid
     text = clean_html_tags(text)
-    
-    # Asenda markdown pealkirjad
-    text = re.sub(r'#{1,6}\s+', '', text)  # Eemalda # pealkirjad
-    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # Asenda **bold** lihtsalt tekstiga
-    text = re.sub(r'\*(.+?)\*', r'\1', text)  # Asenda *italic* lihtsalt tekstiga
-    text = re.sub(r'__(.+?)__', r'\1', text)  # Asenda __underline__ lihtsalt tekstiga
-    text = re.sub(r'~~(.+?)~~', r'\1', text)  # Asenda ~~strikethrough~~ lihtsalt tekstiga
-    
-    # Asenda loetelud (PARANDATUD: kasuta \\1 mitte \1)
+
+    # Asenda markdown pealkirjad (# # # jne)
+    text = re.sub(r'#{1,6}\s+', '', text)
+
+    # Asenda bold, italic, strikethrough markeeringud
+    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # **bold**
+    text = re.sub(r'\*(.+?)\*', r'\1', text)      # *italic*
+    text = re.sub(r'__(.+?)__', r'\1', text)      # __underline__
+    text = re.sub(r'~~(.+?)~~', r'\1', text)      # ~~strikethrough~~
+
+    # Asenda loetelud
     text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
-    text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)  # PARANDATUD
-    
-    # Asenda koodiblokid
-    text = re.sub(r'```[^`]+```', '', text)  # Eemalda koodiblokid
-    text = re.sub(r'`([^`]+)`', r'[\1]', text)  # Asenda inline kood
-    
-    # Asenda lingid
-    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)  # Eemalda lingid
-    
+    text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
+
+    # Eemalda koodiblokid
+    text = re.sub(r'```[^`]+```', '', text)
+    text = re.sub(r'`([^`]+)`', r'[\1]', text)
+
+    # Eemalda lingid (jäta ainult tekst)
+    text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
+
     # Eemalda liigsed tühikud
     text = re.sub(r'\s+', ' ', text)
-    
+
     return text.strip()
 
-def clean_json_markers(text):
+
def format_summary_for_pdf(summary):
    """Format an Estonian article summary for PDF output.

    Cleans markdown, restores paragraph breaks before the summariser's
    numbered section headers, and truncates overly long summaries.
    """
    if not summary:
        return ""

    if not isinstance(summary, str):
        summary = str(summary)

    # Flatten all markdown/HTML formatting.
    summary = clean_markdown_for_pdf(summary)

    # Re-introduce line breaks before the known section headers.
    # NOTE(review): "TRANSFORDIPLANEERIMISE" looks like a typo for
    # "TRANSPORDIPLANEERIMISE", but it must match the stored summary text —
    # kept byte-for-byte; confirm against the summariser's output.
    section_breaks = (
        ('1. ARTIKLI PEAMISED PUNKTID:', '\n1. ARTIKLI PEAMISED PUNKTID:\n'),
        ('2. KASUTATUD MEETODID:', '\n\n2. KASUTATUD MEETODID:\n'),
        ('3. PEAMISED TULEMUSED:', '\n\n3. PEAMISED TULEMUSED:\n'),
        ('4. JÄRELDUSED JA SOOVITUSED:', '\n\n4. JÄRELDUSED JA SOOVITUSED:\n'),
        ('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n5. TRANSFORDIPLANEERIMISE KONTEKST:\n'),
    )
    for marker, replacement in section_breaks:
        summary = summary.replace(marker, replacement)

    # Collapse runs of three or more newlines into a paragraph break.
    summary = re.sub(r'\n{3,}', '\n\n', summary)

    # Hard cap so one summary cannot dominate the document.
    if len(summary) > 4000:
        summary = summary[:4000] + "... [kokkuvõte lõigatud, liiga pikk]"

    return summary
+
+
+# ============================================================================
+# TRANSPORT KONTEKSTI PARSING
+# ============================================================================
+
def extract_json_field(json_string, field_name):
    """Pull a single string field out of raw JSON text using a regex.

    Args:
        json_string: JSON document as a string (may be malformed).
        field_name: name of the field (e.g. "theoretical_contribution").

    Returns:
        The field's raw string value, or None when absent or input is empty.
    """
    if not json_string:
        return None

    # Value may contain escaped quotes (\"), hence the alternation group.
    found = re.search(
        rf'"{field_name}":\s*"([^"]*(?:\\"[^"]*)*)"',
        json_string,
    )
    return found.group(1) if found else None
 
-def extract_and_format_json(data):
-    """Eralda ja vorminda JSON andmed"""
-    formatted_parts = []
-    
-    # Võti-väärtus paaride kaart
-    key_map = {
-        "theoretical_contribution": "Theoretical contribution",
-        "practical_applicability": "Practical applicability", 
-        "problem_solving": "Problem solving",
-        "limitations": "Limitations",
-        "future_research": "Future research",
-        "methodology": "Methodology"
-    }
+
def extract_relevance_score(json_string):
    """Return the relevance_score found in raw JSON text, or None.

    Accepts integers or decimals; the value is returned as a string,
    exactly as matched.
    """
    if not json_string:
        return None

    found = re.search(r'"relevance_score":\s*(\d+(?:\.\d+)?)', json_string)
    return found.group(1) if found else None
+
+
# Field-name -> Estonian section label, in output order.
_FIELD_LABELS = (
    ("theoretical_contribution", "TEOREETILINE PANUS"),
    ("practical_applicability", "PRAKTILINE RAKENDATAVUS"),
    ("problem_solving", "PROBLEEMILAHENDUS"),
    ("limitations", "PIIRANGUD"),
)


def _strip_json_fences(text):
    """Remove markdown ```json ... ``` fences around a JSON payload."""
    text = re.sub(r'```json\s*', '', text)
    text = re.sub(r'\s*```', '', text)
    return text.strip()


def _parse_json_lenient(text):
    """json.loads the text, falling back to the first {...} block; None on failure."""
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        candidate = re.search(r'\{.*\}', text, flags=re.DOTALL)
        if candidate:
            try:
                return json.loads(candidate.group(0))
            except json.JSONDecodeError as exc:
                print(f"❌ Candidate parse also failed: {exc}")
    return None


def _parse_inner_analysis(inner_txt):
    """Try to parse a double-encoded 'analysis' JSON string.

    Repairs two LLM-output quirks before parsing: stray escaped quotes
    around single words, and raw quotes inside string values.

    Returns:
        The parsed dict, or None when the text still is not valid JSON.
    """
    # \"word\" -> "word" (only around bare words, incl. Estonian letters).
    repaired = re.sub(r'\\"([a-züõäöA-ZÜÕÄÖ]+)\\"', r'"\1"', inner_txt)

    # Escape any remaining raw quotes inside "key": "value" pairs so the
    # document becomes parseable.
    def _escape_value_quotes(match):
        key = match.group(1)
        value_escaped = match.group(2).replace('"', '\\"')
        return f'"{key}": "{value_escaped}"'

    repaired = re.sub(
        r'"(\w+)":\s*"([^"]*(?:"[^"]*)*)"', _escape_value_quotes, repaired
    )

    try:
        return json.loads(repaired)
    except json.JSONDecodeError:
        return None


def _fields_via_regex(source):
    """Extract the known analysis fields from raw JSON-ish text with regexes.

    Used when JSON parsing fails outright.  Returns a list of formatted
    section strings (possibly empty).
    """
    parts = []
    for field, label in _FIELD_LABELS:
        # BUGFIX: the former inner-fallback lookahead (?=\s*,\s*") dropped
        # the last string field before the closing brace; (?=\s*,|\s*})
        # accepts both a following field and the end of the object.
        found = re.search(
            rf'"{field}":\s*"(.*?)"(?=\s*,|\s*}})', source, flags=re.DOTALL
        )
        if found:
            content = found.group(1).replace('\\n', '\n').replace('\\"', '"')
            if content:
                parts.append(f"{label}:\n{content}")

    score = re.search(r'"relevance_score":\s*(\d+(?:\.\d+)?)', source)
    if score:
        parts.append(f"RELEVANTSUSE SKOOR: {score.group(1)}/10")
    return parts


def _fields_from_dict(parsed):
    """Format the known analysis fields from an already-parsed dict."""
    parts = [
        f"{label}:\n{parsed[field]}"
        for field, label in _FIELD_LABELS
        if parsed.get(field)
    ]
    if "relevance_score" in parsed:
        parts.append(f"RELEVANTSUSE SKOOR: {parsed['relevance_score']}/10")
    return parts


def format_transport_context(transport_context):
    """Format the transport-planning context for the PDF.

    The stored value may be a dict, a dict wrapping a JSON string under
    'analysis' (sometimes double-encoded and fenced in ```json blocks),
    or a raw JSON/text string.  Strict JSON parsing is tried first; regex
    extraction is the fallback for malformed payloads.

    Returns:
        A formatted multi-line string, or None when nothing usable found.
    """
    if not transport_context:
        return None

    # Locate the analysis payload.
    if isinstance(transport_context, dict):
        # Without an 'analysis' key, assume the whole dict is the analysis.
        analysis_text = transport_context.get('analysis', str(transport_context))
    elif isinstance(transport_context, str):
        analysis_text = transport_context
    else:
        return None

    if not analysis_text:
        return None

    txt = _strip_json_fences(str(analysis_text))
    parsed = _parse_json_lenient(txt)

    if isinstance(parsed, dict):
        # Double-encoded payload: 'analysis' is itself a JSON string that
        # must be cleaned and parsed again.
        if isinstance(parsed.get('analysis'), str):
            inner_txt = _strip_json_fences(parsed['analysis'])
            inner_parsed = _parse_inner_analysis(inner_txt)
            if inner_parsed is not None:
                parsed = inner_parsed
            else:
                # Inner JSON is unrecoverable: extract straight from text.
                parts = _fields_via_regex(inner_txt)
                if parts:
                    return "\n\n".join(parts)

        formatted_parts = _fields_from_dict(parsed)
        return "\n\n".join(formatted_parts) if formatted_parts else None

    # JSON never parsed at all: regex over the raw text.
    print("⚠️ Using regex fallback")
    formatted_parts = _fields_via_regex(txt)
    return "\n\n".join(formatted_parts) if formatted_parts else None
+
+
+# ============================================================================
+# WEAVIATE ANDMEBAASIST PÄRING
+# ============================================================================
 
 def get_all_articles_from_weaviate():
-    """Toob kõik artiklid Weaviate'ist"""
+    """Toob kõik artiklid Weaviate andmebaasist"""
     client = WeaviateClient()
     articles = []
-    
+
     try:
         collection = client.client.collections.get("ScientificArticle")
-        
+
         # Loendi kokku
         count_response = collection.aggregate.over_all(total_count=True)
         total = count_response.total_count
-        
         print(f"Weaviate'is leidsin {total} artiklit")
-        
+
         if total > 0:
             # Toob kõik artiklid
             response = collection.query.fetch_objects(limit=total)
-            
+
             for obj in response.objects:
                 try:
                     article = {
@@ -258,73 +381,35 @@ def get_all_articles_from_weaviate():
                         'summary_et': clean_markdown_for_pdf(obj.properties.get('summary_et', '')),
                         'key_concepts': [clean_markdown_for_pdf(c) for c in obj.properties.get('key_concepts', [])],
                         'methods_used': [clean_markdown_for_pdf(m) for m in obj.properties.get('methods_used', [])],
-                        'transport_context': parse_transport_context(obj.properties.get('transport_context', {})),
+                        'transport_context': obj.properties.get('transport_context', {}),
                         'relevance_score': obj.properties.get('relevance_score', 'N/A'),
                         'processing_date': obj.properties.get('processing_date', ''),
                         'source_file': obj.properties.get('source_file', '')
                     }
                     articles.append(article)
+
                 except Exception as e:
-                    print(f"  Viga artikli {obj.properties.get('article_id', 'unknown')} töötlemisel: {e}")
-                    # Lisa artikel ilma puhastuseta
-                    article = {
-                        'article_id': obj.properties.get('article_id', 'N/A'),
-                        'title': str(obj.properties.get('title', 'N/A')),
-                        'authors': obj.properties.get('authors', []),
-                        'year': obj.properties.get('year', 'N/A'),
-                        'journal': str(obj.properties.get('journal', 'N/A')),
-                        'doi': obj.properties.get('doi', ''),
-                        'abstract_en': str(obj.properties.get('abstract_en', '')),
-                        'summary_et': str(obj.properties.get('summary_et', '')),
-                        'key_concepts': [str(c) for c in obj.properties.get('key_concepts', [])],
-                        'methods_used': [str(m) for m in obj.properties.get('methods_used', [])],
-                        'transport_context': str(obj.properties.get('transport_context', {})),
-                        'relevance_score': obj.properties.get('relevance_score', 'N/A'),
-                        'processing_date': obj.properties.get('processing_date', ''),
-                        'source_file': obj.properties.get('source_file', '')
-                    }
-                    articles.append(article)
-                
+                    print(f"⚠️ Viga artikli {obj.properties.get('article_id', 'unknown')} töötlemisel: {e}")
+                    # Jätka järgmise artikliga
+                    continue
+
     except Exception as e:
-        print(f"Viga artiklite toomisel: {e}")
-        import traceback
+        print(f"❌ Viga artiklite toomisel: {e}")
         traceback.print_exc()
+
     finally:
         client.close()
-    
+
     return articles
 
-def format_summary_for_pdf(summary):
-    """Vorminda kokkuvõte PDF-ile sobivaks"""
-    if not summary:
-        return ""
-    
-    # Kui ei ole string, konverteeri
-    if not isinstance(summary, str):
-        summary = str(summary)
-    
-    # Eemalda kõik vormindus ja tee lihtsaks tekstiks
-    summary = clean_markdown_for_pdf(summary)
-    
-    # Lisa uued read peamiste sektsioonide ette
-    summary = summary.replace('1. ARTIKLI PEAMISED PUNKTID:', '\n1. ARTIKLI PEAMISED PUNKTID:\n')
-    summary = summary.replace('2. KASUTATUD MEETODID:', '\n\n2. KASUTATUD MEETODID:\n')
-    summary = summary.replace('3. PEAMISED TULEMUSED:', '\n\n3. PEAMISED TULEMUSED:\n')
-    summary = summary.replace('4. JÄRELDUSED JA SOOVITUSED:', '\n\n4. JÄRELDUSED JA SOOVITUSED:\n')
-    summary = summary.replace('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n5. TRANSFORDIPLANEERIMISE KONTEKST:\n')
-    
-    # Asenda liigsed reavahetused
-    summary = re.sub(r'\n{3,}', '\n\n', summary)
-    
-    # Lõika liiga pikk tekst
-    if len(summary) > 4000:
-        summary = summary[:4000] + "... [kokkuvõte lõigatud, liiga pikk]"
-    
-    return summary
+
+# ============================================================================
+# PDF GENEREERIMINE
+# ============================================================================
 
 def create_pdf_from_articles(articles, output_filename):
     """Loob PDF faili artiklitest"""
-    
+
     # Loo PDF dokument
     doc = SimpleDocTemplate(
         output_filename,
@@ -334,10 +419,10 @@ def create_pdf_from_articles(articles, output_filename):
         topMargin=72,
         bottomMargin=72
     )
-    
+
     # Stiilide loomine
     styles = getSampleStyleSheet()
-    
+
     # Kohandatud stiilid
     title_style = ParagraphStyle(
         'CustomTitle',
@@ -347,7 +432,7 @@ def create_pdf_from_articles(articles, output_filename):
         textColor=colors.HexColor('#2c3e50'),
         alignment=TA_LEFT
     )
-    
+
     subtitle_style = ParagraphStyle(
         'CustomSubtitle',
         parent=styles['Heading2'],
@@ -356,7 +441,7 @@ def create_pdf_from_articles(articles, output_filename):
         textColor=colors.HexColor('#34495e'),
         alignment=TA_LEFT
     )
-    
+
     section_style = ParagraphStyle(
         'CustomSection',
         parent=styles['Heading3'],
@@ -366,16 +451,16 @@ def create_pdf_from_articles(articles, output_filename):
         textColor=colors.HexColor('#7f8c8d'),
         alignment=TA_LEFT
     )
-    
+
     normal_style = ParagraphStyle(
         'CustomNormal',
         parent=styles['Normal'],
         fontSize=10,
         spaceAfter=6,
         alignment=TA_JUSTIFY,
-        leading=14  # Reavahe
+        leading=14
     )
-    
+
     metadata_style = ParagraphStyle(
         'CustomMetadata',
         parent=styles['Normal'],
@@ -384,44 +469,45 @@ def create_pdf_from_articles(articles, output_filename):
         textColor=colors.HexColor('#5d6d7e'),
         alignment=TA_LEFT
     )
-    
+
     # Elementide kogumine
     elements = []
-    
+
     # Pealkiri ja kokkuvõte
     elements.append(Paragraph("TEADUSARTIKLITE ANDMEBAAS", title_style))
     elements.append(Spacer(1, 12))
-    
+
     today = datetime.now().strftime("%d.%m.%Y %H:%M")
     elements.append(Paragraph(f"Eksporditud: {today}", metadata_style))
     elements.append(Paragraph(f"Artikleid kokku: {len(articles)}", metadata_style))
     elements.append(Spacer(1, 24))
-    
+
+    # ========================================================================
     # Iga artikli jaoks
+    # ========================================================================
+
     for i, article in enumerate(articles):
-        # Artikli pealkiri
+
+        # ARTIKLI PEALKIRI
         elements.append(Paragraph(f"{i+1}. {article['title']}", title_style))
-        
-        # Autorid
+        print(f"✅ {i+1}. {article['title']}")
+
+        # AUTORID
         if article['authors']:
             authors_text = ", ".join(article['authors'])
-            elements.append(Paragraph(f"<b>Autorid:</b> {authors_text}", subtitle_style))
-        
-        # Metaandmed tabelina
+            elements.append(Paragraph(f"Autorid: {authors_text}", subtitle_style))
+
+        # METAANDMED (aasta, žurnaal, DOI, relevantsus)
         metadata_data = []
-        
         if article['year'] and article['year'] != 'N/A':
             metadata_data.append(['Aasta:', str(article['year'])])
-        
         if article['journal'] and article['journal'] != 'N/A':
             metadata_data.append(['Žurnaal:', article['journal']])
-        
         if article['doi']:
             metadata_data.append(['DOI:', article['doi']])
-        
         if article['relevance_score'] and article['relevance_score'] != 'N/A':
             metadata_data.append(['Relevantsus:', f"{article['relevance_score']}/10"])
-        
+
         if metadata_data:
             metadata_table = Table(metadata_data, colWidths=[2*cm, 12*cm])
             metadata_table.setStyle(TableStyle([
@@ -434,18 +520,16 @@ def create_pdf_from_articles(articles, output_filename):
             ]))
             elements.append(metadata_table)
             elements.append(Spacer(1, 12))
-        
-        # Võtmesõnad ja meetodid
+
+        # VÕTMESÕNAD JA MEETODID
         tags_data = []
-        
         if article['key_concepts']:
-            concepts_text = ", ".join(article['key_concepts'][:10])  # Piirangu 10 mõistele
+            concepts_text = ", ".join(article['key_concepts'][:10])  # Piirang: 10 mõiste
             tags_data.append(['Võtmesõnad:', concepts_text])
-        
         if article['methods_used']:
             methods_text = ", ".join(article['methods_used'])
             tags_data.append(['Meetodid:', methods_text])
-        
+
         if tags_data:
             tags_table = Table(tags_data, colWidths=[2*cm, 12*cm])
             tags_table.setStyle(TableStyle([
@@ -459,165 +543,154 @@ def create_pdf_from_articles(articles, output_filename):
             ]))
             elements.append(tags_table)
             elements.append(Spacer(1, 12))
-        
-        # Abstrakt
+
+        # ABSTRAKT (inglise keeles)
         if article['abstract_en']:
-            elements.append(Paragraph("<b>ABSTRAKT (inglise keeles):</b>", section_style))
+            elements.append(Paragraph("ABSTRAKT (inglise keeles):", section_style))
             abstract_text = article['abstract_en']
             if len(abstract_text) > 800:
                 abstract_text = abstract_text[:800] + "..."
             elements.append(Paragraph(abstract_text, normal_style))
             elements.append(Spacer(1, 12))
-        
-        # Kokkuvõte
+
+        # KOKKUVÕTE (eesti keeles)
         if article['summary_et']:
-            elements.append(Paragraph("<b>KOKKUVÕTE (eesti keeles):</b>", section_style))
-            
-            # Formateeri kokkuvõte PDF-ile
+            elements.append(Paragraph("KOKKUVÕTE (eesti keeles):", section_style))
             summary = format_summary_for_pdf(article['summary_et'])
-            
-            # Kasuta lihtsat tekstiparagraphi
             elements.append(Paragraph(summary, normal_style))
             elements.append(Spacer(1, 12))
-        
-        # Transpordi kontekst
+
+        # ====================================================================
+        # TRANSPORDI PLANEERIMISE KONTEKST
+        # ====================================================================
+
         if article['transport_context']:
-            # Debugimiseks
-            debugger_data = str(article['transport_context'])
-            print("-----------  \"" + article['title'] + "\"  -----------")
-            print("-----------  article['transport_context']  -------------")
-            print(debugger_data)
-            elements.append(Paragraph("<b>TRANSFORDIPLANEERIMISE KONTEKST:</b>", section_style))
-            context_text = format_context_for_pdf(article['transport_context']['relevance_score'])
+            elements.append(Paragraph("TRANSFORDIPLANEERIMISE KONTEKST:", section_style))
+
+            # DEBUG-REA – prindi üks-ühele objekt konsooli
+            #print("DEBUG transport_context:", article['article_id'], article['transport_context'])
+
+            context_text = format_transport_context(article['transport_context'])
+            #print(f"DEBUG context_text returned: {context_text}")  # <-- LISA SEE RIDA
+
             if context_text:
-                elements.append(Paragraph("RELEVANTSUSE SKOOR: " + context_text, normal_style))
-                elements.append(Spacer(1, 1))
-            
-            elements.append(Paragraph("<b>ANALÜÜS:</b>", normal_style))
-            analysis_text = article['transport_context']['analysis']
-            # Proovi leida theoretical_contribution regex'iga
-            match = re.search(r'"theoretical_contribution":\s*"([^"]*(?:\\"[^"]*)*)"', analysis_text)
-            if match:
-                context_text = match.group(1)
-                if context_text:
-                    elements.append(Paragraph("<b>TEOREETILINE PANUS:</b>", normal_style))
-                    elements.append(Paragraph(context_text, normal_style))
-            # Proovi leida practical_applicability regex'iga
-            match = re.search(r'"practical_applicability":\s*"([^"]*(?:\\"[^"]*)*)"', analysis_text)
-            if match:
-                context_text = match.group(1)
-                if context_text:
-                    elements.append(Paragraph("<b>PRAKTILINE RAKENDATAVUS:</b>", normal_style))
-                    elements.append(Paragraph(context_text, normal_style))
-            # Proovi leida problem_solving regex'iga
-            match = re.search(r'"problem_solving":\s*"([^"]*(?:\\"[^"]*)*)"', analysis_text)
-            if match:
-                context_text = match.group(1)
-                if context_text:
-                    elements.append(Paragraph("<b>PROBLEEMILAHENDUS:</b>", normal_style))
-                    elements.append(Paragraph(context_text, normal_style))
-            # Proovi leida limitations regex'iga
-            match = re.search(r'"limitations":\s*"([^"]*(?:\\"[^"]*)*)"', analysis_text)
-            if match:
-                context_text = match.group(1)
-                if context_text:
-                    elements.append(Paragraph("<b>PIIRANGUD:</b>", normal_style))
-                    elements.append(Paragraph(context_text, normal_style))
-            # Proovi leida relevance_score regex'iga
-            match = re.search(r'"relevance_score":\s*(\d+(?:\.\d+)?)', analysis_text)
-            if match:
-                context_text = match.group(1)
-                if context_text:
-                    elements.append(Paragraph("<b>RELEVANTSUSE SKOOR:</b> " + context_text, normal_style))
-        
-        # Allikfail ja töötlemise info
+                # Jaga osadeks ja lisa eraldi paragrahfidena
+                parts = context_text.split('\n\n')  # Jaga tühjast reaga
+                for part in parts:
+                    if part.strip():
+                        # Asenda \n <br/> tag'iga
+                        part_html = part.replace('\n', '<br/>')
+                        try:
+                            elements.append(Paragraph(part_html, normal_style))
+                            elements.append(Spacer(1, 6))
+                        except Exception as e:
+                            print(f"❌ Failed to add part to PDF: {e}")
+                            # Kui HTML tag ei tööta, proovi ilma
+                            part_plain = part.replace('\n', ' ')
+                            elements.append(Paragraph(part_plain, normal_style))
+                            elements.append(Spacer(1, 6))
+                
+                print("✅ Context added to PDF successfully")
+            else:
+                elements.append(Paragraph("Analüüsi andmed puuduvad", normal_style))
+
+            elements.append(Spacer(1, 12))
+
+
+        # ====================================================================
+        # FOOTER INFO (allikfail, töötlemise kuupäev)
+        # ====================================================================
+
         footer_info = []
         if article['source_file']:
             source_name = os.path.basename(article['source_file'])
             footer_info.append(f"Allikfail: {source_name}")
-        
+
         if article['processing_date']:
-            # Proovi parsida kuupäeva
             try:
-                # Eemalda mikrosekundid kui on
                 date_str = article['processing_date']
                 if '.' in date_str:
                     date_str = date_str.split('.')[0]
                 date_str = date_str.replace('Z', '+00:00')
                 date_obj = datetime.fromisoformat(date_str)
                 footer_info.append(f"Töödeldud: {date_obj.strftime('%d.%m.%Y %H:%M')}")
-            except Exception as e:
-                # Kui ei õnnestu parsida, kuva algne string (lõigatud)
+            except Exception:
                 footer_info.append(f"Töödeldud: {article['processing_date'][:19]}")
-        
+
         if footer_info:
             elements.append(Spacer(1, 6))
             elements.append(Paragraph(" | ".join(footer_info), metadata_style))
-        
+
         # Lisa lehevahetus (välja arvatud viimase artikli puhul)
         if i < len(articles) - 1:
             elements.append(PageBreak())
         else:
             elements.append(Spacer(1, 24))
-    
-    # Lisa lõppinfo
+
+    # LÕPPINFO
     elements.append(Paragraph("=" * 80, metadata_style))
     elements.append(Spacer(1, 6))
     elements.append(Paragraph(f"Kokku eksporditud artikleid: {len(articles)}", metadata_style))
     elements.append(Paragraph("Eksporditud Weaviate teadusartiklite andmebaasist", metadata_style))
     elements.append(Paragraph(f"PDF genereeritud: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}", metadata_style))
-    
+
     # Koosta PDF
     doc.build(elements)
-    
+
     return len(articles)
 
+
+# ============================================================================
+# PEAMINE FUNKTSIOON
+# ============================================================================
+
 def main():
-    """Peamine funktsioon"""
+    """Peamine funktsioon - käivitab kogu protsessi"""
+
     print("=" * 60)
     print("ARTIKLITE EKSPORT PDF FAILI")
     print("=" * 60)
-    
+
     # Toob artiklid Weaviate'ist
     print("Toon artikleid Weaviate'ist...")
     articles = get_all_articles_from_weaviate()
-    
+
     if not articles:
-        print("Ei leidnud ühtegi artiklit Weaviate'is!")
+        print("Ei leidnud ühtegi artiklit Weaviate'is!")
         return
-    
-    print(f"Leidsin {len(articles)} artiklit")
-    
+
+    print(f"Leidsin {len(articles)} artiklit")
+
     # Genereeri PDF failinimi
     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     output_dir = "./data/exports"
     os.makedirs(output_dir, exist_ok=True)
     output_filename = os.path.join(output_dir, f"artiklid_eksport_{timestamp}.pdf")
-    
+
     # Loo PDF
     print(f"Loon PDF faili: {output_filename}")
+
     try:
         article_count = create_pdf_from_articles(articles, output_filename)
-        
+
         print("=" * 60)
         print(f"✅ VALMIS! Loodud PDF fail: {output_filename}")
-        print(f"   - Eksporditud artikleid: {article_count}")
-        print(f"   - Faili suurus: {os.path.getsize(output_filename) / 1024:.1f} KB")
+        print(f" - Eksporditud artikleid: {article_count}")
+        print(f" - Faili suurus: {os.path.getsize(output_filename) / 1024:.1f} KB")
         print("=" * 60)
-        
-        # Näita esimese artikli pealkirja
+
         if articles:
             print("\nEsimesed artiklid:")
             for i, article in enumerate(articles[:3]):
                 title_preview = article['title']
                 if len(title_preview) > 60:
                     title_preview = title_preview[:60] + "..."
-                print(f"  {i+1}. {title_preview}")
-    
+                print(f" {i+1}. {title_preview}")
+
     except Exception as e:
         print(f"\n❌ VIGA PDF loomisel: {e}")
-        import traceback
         traceback.print_exc()
 
+
 if __name__ == "__main__":
-    main()
+    main()