  1. #!/usr/bin/env python3
  2. """
  3. Teadusartiklite CURL päring → Markdown → PDF konverter
  4. Kasutab jq asemel Pythoni JSON parsimist, et vältida HTML/JSON escaping probleeme
  5. """
import json
import os
import re
import subprocess
from datetime import datetime
# GraphQL query.
# NOTE(review): this module-level constant appears unused — fetch_articles()
# builds its own paginated query string instead. Kept for reference.
QUERY = {
    "query": "{ Get { ScientificArticle { title source_file summary_et transport_context } } }"
}
  14. def fetch_articles():
  15. """Toob artiklid Weaviate GraphQL API-st (KÕIK limit + offset-iga)"""
  16. print("📡 Toon artikleid GraphQL API-st...")
  17. all_articles = []
  18. limit = 500
  19. offset = 0
  20. while True:
  21. query_str = f"""{{
  22. Get {{
  23. ScientificArticle(limit: {limit}, offset: {offset}) {{
  24. title
  25. source_file
  26. summary_et
  27. transport_context
  28. }}
  29. }}
  30. }}"""
  31. query = {"query": query_str}
  32. try:
  33. result = subprocess.run(
  34. ['curl', '-s', 'http://100.80.222.54:9020/v1/graphql',
  35. '-X', 'POST',
  36. '-H', 'Content-Type: application/json',
  37. '-d', json.dumps(query)],
  38. capture_output=True,
  39. text=True,
  40. timeout=60
  41. )
  42. if result.returncode != 0:
  43. print(f"❌ CURL viga: {result.stderr}")
  44. break
  45. data = json.loads(result.stdout)
  46. articles = data.get('data', {}).get('Get', {}).get('ScientificArticle', [])
  47. if not articles:
  48. break
  49. all_articles.extend(articles)
  50. print(f" ✅ Tõin {len(articles)} artiklit (kokku: {len(all_articles)})")
  51. offset += limit
  52. except Exception as e:
  53. print(f"❌ Viga: {e}")
  54. break
  55. print(f"✅ LEIDSIN KOKKU {len(all_articles)} ARTIKLIT!")
  56. return all_articles
  57. def extract_transport_context(transport_context):
  58. """Eraldab transport_context JSON-i võtmväljad loetaval kujul"""
  59. if not transport_context:
  60. return "Andmeid pole saadaval"
  61. result_parts = []
  62. try:
  63. # Kui transport_context on string, parsime seda
  64. if isinstance(transport_context, str):
  65. # Eemalda HTML/JSON sildid
  66. text = transport_context
  67. text = re.sub(r'```json\s*', '', text)
  68. text = re.sub(r'\s*```', '', text)
  69. text = text.strip()
  70. # Proovime JSON-i parsida
  71. try:
  72. obj = json.loads(text)
  73. except:
  74. # Kui ei õnnestu, tagastame stringi
  75. return f"```\n{text[:500]}...\n```"
  76. else:
  77. obj = transport_context
  78. # Parsime 'analysis' välja kui see on string
  79. if isinstance(obj, dict) and 'analysis' in obj:
  80. analysis = obj['analysis']
  81. if isinstance(analysis, str):
  82. # Puhastame JSON markerid
  83. analysis = re.sub(r'```json\s*', '', analysis)
  84. analysis = re.sub(r'\s*```', '', analysis)
  85. analysis = analysis.strip()
  86. # Parsime JSON
  87. try:
  88. analysis_obj = json.loads(analysis)
  89. obj = analysis_obj
  90. except:
  91. # Kui ei õnnestu, kasutame regex'i
  92. pass
  93. # Eraldame võtmväljad
  94. if isinstance(obj, dict):
  95. # Teoreetiline panus
  96. if obj.get('theoretical_contribution'):
  97. result_parts.append(
  98. "#### Teoreetiline panus\n\n" +
  99. obj['theoretical_contribution']
  100. )
  101. # Praktiline rakendatavus
  102. if obj.get('practical_applicability'):
  103. result_parts.append(
  104. "#### Praktiline rakendatavus\n\n" +
  105. obj['practical_applicability']
  106. )
  107. # Probleemilahendus
  108. if obj.get('problem_solving'):
  109. result_parts.append(
  110. "#### Probleemilahendus\n\n" +
  111. obj['problem_solving']
  112. )
  113. # Piirangud
  114. if obj.get('limitations'):
  115. result_parts.append(
  116. "#### Piirangud\n\n" +
  117. obj['limitations']
  118. )
  119. # Relevantsuse skoor
  120. score = obj.get('relevance_score')
  121. if score is not None:
  122. result_parts.append(
  123. f"**Relevantsuse skoor:** {score}/10"
  124. )
  125. return "\n\n".join(result_parts) if result_parts else "Andmeid pole saadaval"
  126. except Exception as e:
  127. return f"Viga parsimisega: {str(e)}"
  128. def generate_markdown(articles):
  129. """Genereerib markdown faili artiklitest"""
  130. print("✍️ Genereerin markdown faili...")
  131. # CSS lehevahetuste jaoks
  132. css_header = """<style>
  133. @media print {{
  134. h2 {{
  135. page-break-before: always;
  136. }}
  137. h2:first-of-type {{
  138. page-break-before: avoid;
  139. }}
  140. }}
  141. </style>
  142. # Teadusartiklite analüüs ja transpordiplaneerimise kontekst
  143. Eksporditud: **{timestamp}**
  144. Artikleid kokku: **{count}**
  145. ---
  146. """.format(
  147. timestamp=datetime.now().strftime("%d.%m.%Y %H:%M"),
  148. count=len(articles)
  149. )
  150. # Genereerime artiklite sektsioonid
  151. content = css_header
  152. for i, article in enumerate(articles, 1):
  153. # Artikli pealkiri ja metadata
  154. content += f"\n## {i}. {article['title']}\n\n"
  155. # Allikfail
  156. source = article.get('source_file', 'N/A')
  157. if source:
  158. source_name = source.split('/')[-1] # Võta ainult failinimi
  159. content += f"**Allikfail:** `{source_name}`\n\n"
  160. # Kokkuvõte
  161. summary = article.get('summary_et', '')
  162. if summary:
  163. content += "### Kokkuvõte (eesti keeles)\n\n"
  164. content += summary + "\n\n"
  165. # Transpordiplaneerimise kontekst
  166. transport = article.get('transport_context')
  167. if transport:
  168. content += "### Transpordiplaneerimise kontekst\n\n"
  169. context_text = extract_transport_context(transport)
  170. content += context_text + "\n\n"
  171. content += "---\n"
  172. # Lõppeinfo
  173. content += f"""
  174. ## Lõppinfo
  175. - **Eksporditud:** {datetime.now().strftime("%d.%m.%Y %H:%M:%S")}
  176. - **Kokku artikle:** {len(articles)}
  177. - **Allikas:** Weaviate teadusartiklite andmebaas
  178. Fail konverteeritud Markdown → PDF VS Code Markdown PDF laiendusega.
  179. """
  180. return content
  181. def save_markdown(content, filepath):
  182. """Salvestab markdown failina"""
  183. try:
  184. with open(filepath, 'w', encoding='utf-8') as f:
  185. f.write(content)
  186. print(f"✅ Markdown fail salvestatud: {filepath}")
  187. return True
  188. except Exception as e:
  189. print(f"❌ Viga faili salvestamisel: {e}")
  190. return False
  191. def main():
  192. print("=" * 60)
  193. print("TEADUSARTIKLITE EKSPORT MARKDOWN/PDF FORMAATI")
  194. print("=" * 60)
  195. # 1. Toome artikleid
  196. articles = fetch_articles()
  197. if not articles:
  198. print("❌ Viga: ei saanud artikleid tuua")
  199. return
  200. # 2. Genereerime markdown
  201. markdown = generate_markdown(articles)
  202. # 3. Salvestame
  203. #output_path = "/home/ardo/Downloads/articles_with_transport_context.md"
  204. output_path = "~/rag-demo/transpordi_artiklid/tmp/articles_with_transport_context.md"
  205. if save_markdown(markdown, output_path):
  206. print("\n" + "=" * 60)
  207. print("✅ VALMIS!")
  208. print("=" * 60)
  209. print(f"\n📄 Markdown fail: {output_path}")
  210. print("\n🚀 Järgmised sammud:")
  211. print(" 1. Avage fail VS Code'is")
  212. print(" 2. Paremklõps peal → 'Markdown PDF: Export (pdf)'")
  213. print(" 3. PDF fail luuakse samasse kausta")
  214. print("\n💡 Nipp: Iga artikkel algab uuelt lehelt!")
  215. else:
  216. print("❌ Viga: ei saanud faili salvestada")
  217. if __name__ == "__main__":
  218. main()