|
|
@@ -0,0 +1,465 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+"""
|
|
|
+ULTRA KOMPAKTNE NIMEKIRI
|
|
|
+Väljad: Pealkiri, Allikfail, Uurimisküsimus
|
|
|
+PARANDUS: Eemaldab markdown loetelud (* ja -) enne lõikamist
|
|
|
+"""
|
|
|
+
|
|
|
+import subprocess
|
|
|
+import json
|
|
|
+import csv
|
|
|
+import re
|
|
|
+from datetime import datetime
|
|
|
+
|
|
|
# GraphQL query — fetches ONLY the 3 fields the exports need
# (title, source file, Estonian summary), capped at 800 articles.
ULTRA_COMPACT_QUERY = {
    "query": """
    {
      Get {
        ScientificArticle(limit: 800) {
          title
          source_file
          summary_et
        }
      }
    }
    """
}
|
|
|
+
|
|
|
def fetch_articles(url='http://100.80.222.54:9020/v1/graphql', timeout=60):
    """Fetch articles (title, source_file, summary_et) via the GraphQL endpoint.

    Generalized: the endpoint URL and request timeout were hard-coded; they are
    now defaulted parameters, so existing callers are unaffected.

    Args:
        url: GraphQL endpoint to POST the query to.
        timeout: Maximum seconds to wait for the curl subprocess.

    Returns:
        A list of article dicts on success, or None on any error
        (errors are printed, not raised).
    """
    print("📡 Toon artikleid (title, source_file, summary_et)...")

    try:
        # List-form argv (shell=False) — no shell injection risk from the URL
        # or the JSON payload.
        result = subprocess.run(
            [
                'curl',
                '-s',
                url,
                '-X', 'POST',
                '-H', 'Content-Type: application/json',
                '-d', json.dumps(ULTRA_COMPACT_QUERY)
            ],
            capture_output=True,
            text=True,
            timeout=timeout
        )

        if result.returncode != 0:
            print(f"❌ CURL viga: {result.stderr}")
            return None

        # Drill into the GraphQL envelope; .get() chains default to [] so a
        # missing key yields an empty list rather than a KeyError.
        data = json.loads(result.stdout)
        articles = data.get('data', {}).get('Get', {}).get('ScientificArticle', [])
        print(f"✅ Leidsin {len(articles)} artiklit")
        return articles

    except Exception as e:
        # Broad catch is deliberate: this is a best-effort CLI tool and the
        # caller treats None as "fetch failed".
        print(f"❌ Viga: {e}")
        return None
|
|
|
+
|
|
|
def clean_markdown_lists(text):
    """Strip markdown markup and return the substantive text on one line.

    Handles:
    - unordered list markers (``*``, ``-``, ``•``, ``+``)
    - ordered list markers (``1.`` ``2.`` ...)
    - bold/italic markers (``**``, ``*``, ``__``, ``_``)
    - headings (``#``), links ``[text](url)`` and inline ``code``

    Returns "N/A" for empty/None input; otherwise the cleaned text with
    list items joined by single spaces and whitespace collapsed.
    """
    if not text:
        return "N/A"

    # Drop heading markers (# ## ###) at the start of any line.
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)

    # Unwrap emphasis, strongest first: **x** -> x, *x* -> x, __x__ -> x, _x_ -> x.
    for marker in (r'\*\*(.+?)\*\*', r'\*(.+?)\*', r'__(.+?)__', r'_(.+?)_'):
        text = re.sub(marker, r'\1', text)

    # Per line: strip a leading bullet marker, then a leading numeric marker,
    # trim, and keep only non-empty lines.
    kept = []
    for raw_line in text.split('\n'):
        item = re.sub(r'^\s*[-*•+]\s+', '', raw_line)
        item = re.sub(r'^\s*\d+\.\s+', '', item).strip()
        if item:
            kept.append(item)
    # Former list items become one flowing line, separated by spaces.
    text = ' '.join(kept)

    # Unwrap links [text](url) -> text and inline code `x` -> x.
    text = re.sub(r'\[(.+?)\]\(.+?\)', r'\1', text)
    text = re.sub(r'`(.+?)`', r'\1', text)

    # Collapse any remaining whitespace runs into single spaces.
    return re.sub(r'\s+', ' ', text).strip()
|
|
|
+
|
|
|
def extract_research_question(summary_et):
    """
    Extract the research-question text from the start of summary_et.

    Looks for the text following the "Uurimisküsimused ja eesmärgid:"
    (research questions and goals) heading, running up to the
    "Teaduslik tähtsus:" (scientific relevance) heading or end of text.
    Markdown list/emphasis markers are stripped before truncation.

    Args:
        summary_et: Estonian-language article summary (markdown-ish text).

    Returns:
        The cleaned extract, truncated to 1000 characters (with "..."
        appended when cut), or "N/A" for empty/None input.
    """
    if not summary_et:
        return "N/A"

    # Locate the "Uurimisküsimused ja eesmärgid:" section. The heading may be
    # preceded by a bullet marker and/or wrapped in up to two '*' (bold);
    # capture is non-greedy up to the next "Teaduslik tähtsus:" heading or $.
    patterns = [
        r'(?:^|\n)\s*(?:[-•*•]\s+)?\*{0,2}Uurimisküsimused ja eesmärgid:\*{0,2}\s*(.+?)(?=(?:^|\n)\s*(?:[-•*•]\s+)?\*{0,2}Teaduslik tähtsus:|$)',
    ]

    text = None
    for pattern in patterns:
        # DOTALL lets the capture span lines; IGNORECASE tolerates case drift.
        match = re.search(pattern, summary_et, re.DOTALL | re.IGNORECASE)
        if match:
            text = match.group(1).strip()
            break

    # Fallback when no section heading is found: just take the opening text.
    if text is None:
        text = summary_et[:300].strip()

    # Strip markdown list/emphasis markup before measuring length.
    text = clean_markdown_lists(text)

    # Truncate to 1000 characters. (The original comment claimed 200, but the
    # code cuts at 1000 — the comment was wrong, not the code.)
    if len(text) > 1000:
        text = text[:1000] + "..."

    return text.strip()
|
|
|
+
|
|
|
def generate_csv(articles, csv_path="/home/ardo/Downloads/articles_ultra_compact.csv"):
    """Write the article list to a CSV file.

    Generalized: the output path was hard-coded; it is now a defaulted
    parameter (same default value), so existing callers are unaffected.

    Args:
        articles: Iterable of article dicts with 'title', 'source_file'
            and 'summary_et' keys.
        csv_path: Destination file path.

    Returns:
        The path written on success, or None on any error (error is printed).
    """
    try:
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(['#', 'Pealkiri', 'Allikfail', 'Uurimisküsimus'])

            for i, article in enumerate(articles, 1):
                title = article.get('title', 'N/A')
                # Keep only the file name, not the full stored path.
                source = article.get('source_file', 'N/A').split('/')[-1]
                summary = article.get('summary_et', '')
                research_q = extract_research_question(summary)

                writer.writerow([i, title, source, research_q])

        return csv_path
    except Exception as e:
        print(f"❌ Viga CSV loomisel: {e}")
        return None
|
|
|
+
|
|
|
def generate_markdown(articles):
    """Render the article list as a Markdown table and write it to disk.

    Writes a fixed path under ~/Downloads. Cell text has '|' replaced with
    '-' so article text cannot break the table layout.

    Args:
        articles: Iterable of article dicts with 'title', 'source_file'
            and 'summary_et' keys.

    Returns:
        The path written on success, or None on any error (error is printed).
    """
    md_path = "/home/ardo/Downloads/articles_ultra_compact.md"

    md_content = f"""# Teadusartiklite ultra-kompaktne nimekiri

**Kokku artikle:** {len(articles)}
**Eksporditud:** {datetime.now().strftime("%d.%m.%Y %H:%M")}

**Väljad:**
1. Pealkiri
2. Allikfail
3. Uurimisküsimus (esimene 200 tähemärki "Uurimisküsimused ja eesmärgid" sektsoonist, ilma markdown loetelu markerita)

---

| # | Pealkiri | Allikfail | Uurimisküsimus |
|---|----------|-----------|----------------|
"""

    for i, article in enumerate(articles, 1):
        title = article.get('title', 'N/A')
        # Keep only the file name, not the full stored path.
        source = article.get('source_file', 'N/A').split('/')[-1]
        summary = article.get('summary_et', '')
        research_q = extract_research_question(summary)

        # Markdown-safe: '|' would terminate a table cell, so replace it.
        title = title.replace('|', '-')
        research_q = research_q.replace('|', '-')

        md_content += f"| {i} | {title} | `{source}` | {research_q} |\n"

    md_content += f"""
---

**Loodud:** {datetime.now().strftime("%d.%m.%Y %H:%M:%S")}
**Allikas:** Weaviate teadusartiklite andmebaas

---

## 📝 Märkused

✅ **CSV on korras** - sisaldab tervet uurimisküsimust
✅ **Markdown tabel** - eemaldab loetelu markerid (* ja -) enne lõikamist
✅ **HTML** - ilus vaade brauseris

**Parandused:**
- Eemaldab markdown loetelud (`*`, `-`, numbrid)
- Säilitab sisulise teksti
- Ühendab loetelu elemente tühikutega
- Lõikab sujuvalt 200 tähemärgis
"""

    try:
        with open(md_path, 'w', encoding='utf-8') as f:
            f.write(md_content)
        return md_path
    except Exception as e:
        print(f"❌ Viga Markdown loomisel: {e}")
        return None
|
|
|
+
|
|
|
def generate_html(articles, html_path="/home/ardo/Downloads/articles_ultra_compact.html"):
    """Render the article list as a styled HTML table and write it to disk.

    BUG FIX: the original called str.format() on the finished document, but
    the template is full of literal CSS braces (``body { ... }``), so format()
    always raised, the except handler fired, and the file was never written.
    The ``{count}``/``{timestamp}`` placeholders are now filled with plain
    str.replace() instead, which ignores all other braces (including any in
    article text).

    Also generalized: the output path is now a defaulted parameter (same
    default value), so existing callers are unaffected.

    Args:
        articles: Iterable of article dicts with 'title', 'source_file'
            and 'summary_et' keys.
        html_path: Destination file path.

    Returns:
        The path written on success, or None on any error (error is printed).
    """
    html_content = """<!DOCTYPE html>
<html lang="et">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Teadusartiklite ultra-kompaktne nimekiri</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Arial, sans-serif;
            margin: 20px;
            background: #f8f9fa;
        }
        .header {
            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
            color: white;
            padding: 30px;
            border-radius: 12px;
            margin-bottom: 20px;
            box-shadow: 0 4px 6px rgba(0,0,0,0.1);
        }
        .header h1 {
            margin: 0 0 10px 0;
            font-size: 28px;
        }
        .header p {
            margin: 5px 0;
            opacity: 0.95;
        }
        table {
            width: 100%;
            border-collapse: collapse;
            background: white;
            box-shadow: 0 2px 8px rgba(0,0,0,0.1);
            border-radius: 8px;
            overflow: hidden;
        }
        th {
            background: #495057;
            color: white;
            padding: 15px 12px;
            text-align: left;
            font-weight: 600;
            border: none;
        }
        td {
            padding: 12px;
            border-bottom: 1px solid #dee2e6;
            vertical-align: top;
        }
        tr:last-child td {
            border-bottom: none;
        }
        tr:hover {
            background: #f8f9fa;
        }
        .index {
            width: 50px;
            text-align: center;
            font-weight: bold;
            color: #667eea;
        }
        .title {
            font-weight: 600;
            color: #212529;
            max-width: 300px;
        }
        .source {
            font-family: 'Courier New', monospace;
            font-size: 11px;
            color: #6c757d;
            background: #f8f9fa;
            padding: 4px 8px;
            border-radius: 4px;
            max-width: 200px;
            word-break: break-all;
        }
        .research {
            font-size: 13px;
            color: #495057;
            line-height: 1.6;
            max-width: 600px;
        }
        .footer {
            margin-top: 30px;
            padding: 20px;
            background: white;
            border-radius: 8px;
            text-align: center;
            color: #6c757d;
            box-shadow: 0 2px 4px rgba(0,0,0,0.05);
        }
        .badge {
            display: inline-block;
            background: #28a745;
            color: white;
            padding: 3px 8px;
            border-radius: 4px;
            font-size: 11px;
            margin-top: 10px;
        }
        @media print {
            body { background: white; }
            .header { background: #667eea; }
            tr:hover { background: white; }
        }
    </style>
</head>
<body>
    <div class="header">
        <h1>📚 Teadusartiklite ultra-kompaktne nimekiri</h1>
        <p><strong>Kokku artikle:</strong> {count}</p>
        <p><strong>Eksporditud:</strong> {timestamp}</p>
        <p><strong>Väljad:</strong> Pealkiri, Allikfail, Uurimisküsimus</p>
        <div class="badge">✅ Markdown loetelud puhastatud</div>
    </div>

    <table>
        <thead>
            <tr>
                <th class="index">#</th>
                <th>Pealkiri</th>
                <th>Allikfail</th>
                <th>Uurimisküsimus</th>
            </tr>
        </thead>
        <tbody>
"""

    for i, article in enumerate(articles, 1):
        title = article.get('title', 'N/A')
        # Keep only the file name, not the full stored path.
        source = article.get('source_file', 'N/A').split('/')[-1]
        summary = article.get('summary_et', '')
        research_q = extract_research_question(summary)

        # NOTE(review): cell values are interpolated without HTML-escaping —
        # confirm upstream text never contains '<' / '&' before hardening.
        html_content += f"""            <tr>
                <td class="index">{i}</td>
                <td class="title">{title}</td>
                <td class="source">{source}</td>
                <td class="research">{research_q}</td>
            </tr>
"""

    html_content += """        </tbody>
    </table>

    <div class="footer">
        <p><strong>Loodud:</strong> """ + datetime.now().strftime("%d.%m.%Y %H:%M:%S") + """</p>
        <p>Allikas: Weaviate teadusartiklite andmebaas</p>
        <p>💡 Tip: Print → Save as PDF ekspordiks</p>
        <p style="font-size: 12px; margin-top: 20px; color: #999;">
            Markdown loetelud (*, -, 1., jne) eemaldatud enne lõikamist.
        </p>
    </div>
</body>
</html>
"""

    # Fill the two header placeholders. str.format() is NOT usable here: the
    # CSS braces in the template (and any braces in article text) would make
    # it raise. Plain replace() touches only these exact tokens.
    html_content = html_content.replace(
        '{count}', str(len(articles))
    ).replace(
        '{timestamp}', datetime.now().strftime("%d.%m.%Y %H:%M")
    )

    try:
        with open(html_path, 'w', encoding='utf-8') as f:
            f.write(html_content)
        return html_path
    except Exception as e:
        print(f"❌ Viga HTML loomisel: {e}")
        return None
|
|
|
+
|
|
|
def main():
    """CLI entry point: fetch articles and export CSV, Markdown and HTML files."""
    print("=" * 70)
    print("ULTRA-KOMPAKTNE NIMEKIRI: 3 VÄLJA (parandatud)")
    print("=" * 70)
    print("Väljad:")
    print(" 1. Pealkiri")
    print(" 2. Allikfail")
    print(" 3. Uurimisküsimus (200 tähemärki, ILMA markdown loetelu markerita)")
    print()

    # 1. Fetch the articles; abort on failure (fetch_articles returns None
    # on error, and an empty list is also treated as nothing to export).
    articles = fetch_articles()
    if not articles:
        print("❌ Viga: ei saanud artikleid tuua")
        return

    print()
    print("📊 Genereerides väljundeid...")

    # 2. Generate the CSV export.
    csv_path = generate_csv(articles)
    if csv_path:
        print(f"✅ CSV fail: {csv_path}")

    # 3. Generate the Markdown export.
    md_path = generate_markdown(articles)
    if md_path:
        print(f"✅ Markdown fail: {md_path}")

    # 4. Generate the HTML export.
    html_path = generate_html(articles)
    if html_path:
        print(f"✅ HTML fail: {html_path}")

    print()
    print("=" * 70)
    print("✅ VALMIS!")
    print("=" * 70)
    print()
    # NOTE(review): the paths below may be None if a generator failed;
    # the summary still prints them as-is.
    print("📄 Väljundfailid:")
    print(f" 1. CSV: {csv_path}")
    print(f" 2. Markdown: {md_path}")
    print(f" 3. HTML: {html_path}")
    print()
    print("🚀 Kasutamine:")
    print(" - CSV: Ava Excelis (sorteerimine, filtreerimine)")
    print(" - HTML: Ava brauseris (ilus kuju, print → PDF)")
    print(" - Markdown: Ava VS Code'is")
    print()
    print("✨ Parandused:")
    print(" ✅ Eemaldab markdown loetelud (* ja -)")
    print(" ✅ Säilitab sisulise teksti")
    print(" ✅ CSV on korras")
    print(" ✅ Markdown tabel on luetav")
    print()
    # Show a sample of the first article so the user can sanity-check output.
    print("💡 Näide väljast:")
    if articles:
        print(f" Pealkiri: {articles[0].get('title', 'N/A')[:60]}...")
        print(f" Allikfail: {articles[0].get('source_file', 'N/A').split('/')[-1]}")
        research_q = extract_research_question(articles[0].get('summary_et', ''))
        print(f" Uurimisküsimus: {research_q[:80]}...")
|
|
|
+
|
|
|
+if __name__ == "__main__":
|
|
|
+ main()
|