ソースを参照

faili värskendus

Ardo Kubjas 3 ヶ月 前
コミット
f5781e4a64

+ 14 - 0
LOEMIND_CURL.md

@@ -0,0 +1,14 @@
+
+
+### Eestikeelsete ülevaadete salvestamine markdown faili
+```bash
+curl -s http://100.80.222.54:9020/v1/graphql \
+  -X POST \
+  -H "Content-Type: application/json" \
+  -d '{
+    "query": "{ Get { ScientificArticle { title source_file summary_et } } }"
+  }' \
+| jq -r '.data.Get.ScientificArticle[] |
+  "\n## " + .title + "\n\n" + "**Source:** " + .source_file + "\n\n" + .summary_et' \
+> ~/Downloads/articles_summary_et.md
+```

+ 88 - 10
fetch_articles/ULTRA_COMPACT/ULTRA_COMPACT.md

@@ -26,6 +26,28 @@ ULTRA_COMPACT_QUERY = {
     """
 }
 
def fix_broken_words(text):
    """
    Rejoin words that were split across line breaks and flatten text to one line.

    Examples:
        "trac\\nows"       -> "tracows"
        "vähe\\nmusest"    -> "vähemusest"
        "vähe\\r\\nmusest"  -> "vähemusest"

    Args:
        text: The string to clean. Falsy values (None, "") are returned
            unchanged.

    Returns:
        The text with split words rejoined, every remaining line break (and
        the whitespace that follows it) collapsed into a single space, and the
        known "trac ow" artefact replaced by "traffic flow".
    """
    if not text:
        return text

    # Step 1: a word fragment of at least 3 characters, a line break, and a
    # fragment of at least 2 characters are treated as one word split in two.
    # The upper bounds (20 / 10) do not cap word length, because the match may
    # start or end inside a longer word.
    # The optional \r makes Windows (CRLF) line endings behave like plain \n.
    text = re.sub(r'(\w{3,20})\r?\n(\w{2,10})', r'\1\2', text)

    # Step 2: every remaining line break, together with any whitespace that
    # follows it, becomes a single space.
    text = re.sub(r'\r?\n\s*', ' ', text)

    # Artefact collected from real data; the root cause is unknown.
    text = text.replace('trac ow', 'traffic flow')

    return text
+
 def fetch_articles():
     """Toob artiklid"""
     print("📡 Toon artikleid (title, source_file, summary_et)...")
@@ -118,12 +140,12 @@ def extract_research_question(summary_et):
     """
     if not summary_et:
         return "N/A"
-    
+        
     # Otsi "Uurimisküsimused ja eesmärgid:" sektsiooni
     patterns = [
         r'(?:^|\n)\s*(?:[-•*•]\s+)?\*{0,2}Uurimisküsimused ja eesmärgid:\*{0,2}\s*(.+?)(?=(?:^|\n)\s*(?:[-•*•]\s+)?\*{0,2}Teaduslik tähtsus:|$)',
     ]
-    
+
     text = None
     for pattern in patterns:
         match = re.search(pattern, summary_et, re.DOTALL | re.IGNORECASE)
@@ -170,21 +192,68 @@ def generate_markdown(articles):
     """Genereerib Markdown tabel"""
     md_path = "/home/ardo/Downloads/articles_ultra_compact.md"
     
+    # ✅ V3.6: CSS styling Markdown failis
     md_content = f"""# Teadusartiklite ultra-kompaktne nimekiri
 
+<style type="text/css">
+@page {{
+    size: landscape;      /* ✅ A4 11" x 8.5" */
+    margin: 10mm;
+}}
+body {{
+    margin: 0;
+    padding: 20px;
+    width: 100%;
+}}
+table {{
+    width: 100%;
+    table-layout: fixed;
+}}
+
+table th:nth-child(1),
+table td:nth-child(1) {{
+    width: 5%;
+}}
+
+table th:nth-child(2),
+table td:nth-child(2) {{
+    width: 20%;
+}}
+
+table th:nth-child(3),
+table td:nth-child(3) {{
+    width: 15%;
+}}
+
+table th:nth-child(4),
+table td:nth-child(4) {{
+    width: 60%;
+}}
+
+table td {{
+    word-wrap: break-word;
+    overflow-wrap: break-word;
+}}
+</style>
+
 **Kokku artikle:** {len(articles)}  
 **Eksporditud:** {datetime.now().strftime("%d.%m.%Y %H:%M")}
 
 **Väljad:**
-1. Pealkiri
-2. Allikfail
-3. Uurimisküsimus (esimene 200 tähemärki "Uurimisküsimused ja eesmärgid" sektsoonist, ilma markdown loetelu markerita)
+1. \# (5% laiusest)
+2. Pealkiri (20% laiusest)
+3. Allikfail (15% laiusest)
+4. Uurimisküsimus (60% laiusest)
 
 ---
 
-| # | Pealkiri | Allikfail | Uurimisküsimus |
-|---|----------|-----------|----------------|
 """
+
+    # Line-by-line ehitus
+    lines = [
+        "| # | Pealkiri | Allikfail | Uurimisküsimus |",
+        "|---|----------|-----------|----------------|",
+    ]
     
     for i, article in enumerate(articles, 1):
         title = article.get('title', 'N/A')
@@ -194,10 +263,18 @@ def generate_markdown(articles):
         
         # Markdown-safe (eemalda |)
         title = title.replace('|', '-')
+        #title = title.replace('trac ow', 'traffic flow')
+        title = clean_markdown_lists(title)
+        # ✅ OLULINE: Esmalt parandame poolitatud sõnad
+        title = fix_broken_words(title)
         research_q = research_q.replace('|', '-')
-        
-        md_content += f"| {i} | {title} | `{source}` | {research_q} |\n"
-    
+        # ✅ Puhas rida: ilma trailing spaces'ita
+        row = f"| {i} | {title} | {source} | {research_q} |"
+        lines.append(row)
+
+    md_content += "\n".join(lines)
+
+
     md_content += f"""
 ---
 
@@ -227,6 +304,7 @@ def generate_markdown(articles):
         print(f"❌ Viga Markdown loomisel: {e}")
         return None
 
+
 def generate_html(articles):
     """Genereerib HTML tabel"""
     html_path = "/home/ardo/Downloads/articles_ultra_compact.html"

+ 1 - 0
fetch_articles/fetch_and_convert.py

@@ -370,6 +370,7 @@ def get_all_articles_from_weaviate():
 
             for obj in response.objects:
                 try:
+                    print(obj.properties.get('abstract_en', ''))
                     article = {
                         'article_id': obj.properties.get('article_id', 'N/A'),
                         'title': clean_markdown_for_pdf(obj.properties.get('title', 'N/A')),