#!/usr/bin/env python3
"""
Scientific-article CURL query → Markdown → PDF converter.

Uses Python JSON parsing instead of jq to avoid HTML/JSON escaping problems.
Flow: fetch all articles from a Weaviate GraphQL endpoint (via curl),
render them to a single Markdown document, and save it for later
Markdown → PDF export from VS Code.
"""

import json
import os
import re
import subprocess
from datetime import datetime

# Base GraphQL query. Kept for reference/compatibility — fetch_articles()
# builds its own paginated variant of this query at runtime.
QUERY = {
    "query": "{ Get { ScientificArticle { title source_file summary_et transport_context } } }"
}


def fetch_articles():
    """Fetch ALL articles from the Weaviate GraphQL API using limit+offset paging.

    Returns:
        list[dict]: article objects with keys ``title``, ``source_file``,
        ``summary_et`` and ``transport_context``. May be empty (or partial)
        if the endpoint is unreachable or returns an error.
    """
    print("📡 Toon artikleid GraphQL API-st...")

    all_articles = []
    limit = 500
    offset = 0

    while True:
        # Paginated query; loop ends when a page comes back empty.
        query_str = f"""{{
  Get {{
    ScientificArticle(limit: {limit}, offset: {offset}) {{
      title
      source_file
      summary_et
      transport_context
    }}
  }}
}}"""
        query = {"query": query_str}

        try:
            # curl is used deliberately (see module docstring); payload is
            # serialized with json.dumps so quoting/escaping stays correct.
            result = subprocess.run(
                ['curl', '-s', 'http://100.80.222.54:9020/v1/graphql',
                 '-X', 'POST',
                 '-H', 'Content-Type: application/json',
                 '-d', json.dumps(query)],
                capture_output=True,
                text=True,
                timeout=60
            )

            if result.returncode != 0:
                print(f"❌ CURL viga: {result.stderr}")
                break

            data = json.loads(result.stdout)
            articles = data.get('data', {}).get('Get', {}).get('ScientificArticle', [])

            if not articles:
                # Empty page → pagination finished.
                break

            all_articles.extend(articles)
            print(f"   ✅ Tõin {len(articles)} artiklit (kokku: {len(all_articles)})")
            offset += limit

        except Exception as e:
            # Best effort: network/timeout/JSON errors end pagination and we
            # return whatever was fetched so far.
            print(f"❌ Viga: {e}")
            break

    print(f"✅ LEIDSIN KOKKU {len(all_articles)} ARTIKLIT!")
    return all_articles


def extract_transport_context(transport_context):
    """Extract the key fields of a transport_context value in readable form.

    The value may be a dict, a JSON string, or a JSON string wrapped in
    ```json fenced markers; it may also nest a JSON string under an
    ``analysis`` key. Unparseable strings are returned truncated inside a
    code fence.

    Args:
        transport_context: dict | str | None — raw value from the database.

    Returns:
        str: Markdown fragment, or a placeholder when nothing usable exists.
    """
    if not transport_context:
        return "Andmeid pole saadaval"

    result_parts = []

    try:
        if isinstance(transport_context, str):
            # Strip markdown code-fence markers around embedded JSON.
            text = transport_context
            text = re.sub(r'```json\s*', '', text)
            text = re.sub(r'\s*```', '', text)
            text = text.strip()

            try:
                obj = json.loads(text)
            except (json.JSONDecodeError, ValueError):
                # Not JSON — show the raw (truncated) string instead.
                return f"```\n{text[:500]}...\n```"
        else:
            obj = transport_context

        # Some records wrap the real payload as a JSON string under 'analysis'.
        if isinstance(obj, dict) and 'analysis' in obj:
            analysis = obj['analysis']
            if isinstance(analysis, str):
                analysis = re.sub(r'```json\s*', '', analysis)
                analysis = re.sub(r'\s*```', '', analysis)
                analysis = analysis.strip()
                try:
                    obj = json.loads(analysis)
                except (json.JSONDecodeError, ValueError):
                    # Leave obj as-is; the sections below simply stay empty.
                    pass

        if isinstance(obj, dict):
            # Theoretical contribution
            if obj.get('theoretical_contribution'):
                result_parts.append(
                    "#### Teoreetiline panus\n\n" + obj['theoretical_contribution']
                )
            # Practical applicability
            if obj.get('practical_applicability'):
                result_parts.append(
                    "#### Praktiline rakendatavus\n\n" + obj['practical_applicability']
                )
            # Problem solving
            if obj.get('problem_solving'):
                result_parts.append(
                    "#### Probleemilahendus\n\n" + obj['problem_solving']
                )
            # Limitations
            if obj.get('limitations'):
                result_parts.append(
                    "#### Piirangud\n\n" + obj['limitations']
                )
            # Relevance score (0 is a valid score, hence the None check)
            score = obj.get('relevance_score')
            if score is not None:
                result_parts.append(
                    f"**Relevantsuse skoor:** {score}/10"
                )

        return "\n\n".join(result_parts) if result_parts else "Andmeid pole saadaval"

    except Exception as e:
        # Defensive catch-all: a malformed record must not abort the export.
        return f"Viga parsimisega: {str(e)}"


def generate_markdown(articles):
    """Render the article list as one Markdown document.

    Args:
        articles: list of article dicts (see fetch_articles()).

    Returns:
        str: the complete Markdown document.
    """
    print("✍️ Genereerin markdown faili...")

    # Document header with export timestamp and article count.
    css_header = """
# Teadusartiklite analüüs ja transpordiplaneerimise kontekst

Eksporditud: **{timestamp}**

Artikleid kokku: **{count}**

---

""".format(
        timestamp=datetime.now().strftime("%d.%m.%Y %H:%M"),
        count=len(articles)
    )

    content = css_header

    for i, article in enumerate(articles, 1):
        # Article title and metadata
        content += f"\n## {i}. {article['title']}\n\n"

        # Source file — show only the basename, not the full path.
        source = article.get('source_file', 'N/A')
        if source:
            source_name = source.split('/')[-1]
            content += f"**Allikfail:** `{source_name}`\n\n"

        # Estonian summary
        summary = article.get('summary_et', '')
        if summary:
            content += "### Kokkuvõte (eesti keeles)\n\n"
            content += summary + "\n\n"

        # Transport-planning context
        transport = article.get('transport_context')
        if transport:
            content += "### Transpordiplaneerimise kontekst\n\n"
            context_text = extract_transport_context(transport)
            content += context_text + "\n\n"

        content += "---\n"

    # Footer
    content += f"""
## Lõppinfo

- **Eksporditud:** {datetime.now().strftime("%d.%m.%Y %H:%M:%S")}
- **Kokku artikle:** {len(articles)}
- **Allikas:** Weaviate teadusartiklite andmebaas

Fail konverteeritud Markdown → PDF VS Code Markdown PDF laiendusega.
"""
    return content


def save_markdown(content, filepath):
    """Write *content* to *filepath* as UTF-8.

    Returns:
        bool: True on success, False on any I/O error (error is printed).
    """
    try:
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)
        print(f"✅ Markdown fail salvestatud: {filepath}")
        return True
    except Exception as e:
        print(f"❌ Viga faili salvestamisel: {e}")
        return False


def main():
    print("=" * 60)
    print("TEADUSARTIKLITE EKSPORT MARKDOWN/PDF FORMAATI")
    print("=" * 60)

    # 1. Fetch the articles
    articles = fetch_articles()
    if not articles:
        print("❌ Viga: ei saanud artikleid tuua")
        return

    # 2. Generate the markdown document
    markdown = generate_markdown(articles)

    # 3. Save it. BUG FIX: open() does not expand "~", so the tilde path
    # must go through os.path.expanduser before writing.
    output_path = os.path.expanduser(
        "~/rag-demo/transpordi_artiklid/tmp/articles_with_transport_context.md"
    )

    if save_markdown(markdown, output_path):
        print("\n" + "=" * 60)
        print("✅ VALMIS!")
        print("=" * 60)
        print(f"\n📄 Markdown fail: {output_path}")
        print("\n🚀 Järgmised sammud:")
        print("   1. Avage fail VS Code'is")
        print("   2. Paremklõps peal → 'Markdown PDF: Export (pdf)'")
        print("   3. PDF fail luuakse samasse kausta")
        print("\n💡 Nipp: Iga artikkel algab uuelt lehelt!")
    else:
        print("❌ Viga: ei saanud faili salvestada")


if __name__ == "__main__":
    main()