save_articles_to_pdf.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696
  1. import os
  2. import sys
  3. import re
  4. from datetime import datetime
  5. from reportlab.lib.pagesizes import A4
  6. from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
  7. from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
  8. from reportlab.lib.enums import TA_JUSTIFY, TA_LEFT
  9. from reportlab.lib import colors
  10. from reportlab.lib.units import cm
  11. import json
  12. import traceback
  13. # Lisa src kaust Pythoni teele
  14. sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
  15. from src.weaviate_client import WeaviateClient
  16. # ============================================================================
  17. # STRINGI PUHASTAMISE FUNKTSIOONID
  18. # ============================================================================
  19. def clean_html_tags(text):
  20. """Puhasta tekst HTML/XML siltidest ReportLab-i jaoks"""
  21. if not text:
  22. return ""
  23. # Eemalda kõik HTML/XML sildid
  24. text = re.sub(r'<[^>]+>', '', text)
  25. # Asenda erimärgid ReportLab-ile sobivate märkidega
  26. replacements = {
  27. ' ': ' ',
  28. '&': '&',
  29. '<': '<',
  30. '>': '>',
  31. '"': '"',
  32. '’': "'",
  33. '‘': "'",
  34. '\u00a0': ' ', # mitte-tühik
  35. '\u2026': '...', # ellipsis
  36. '\u2013': '-', # n-sild
  37. '\u2014': '-', # m-sild
  38. '\u2018': "'", # vasak ülakoma
  39. '\u2019': "'", # parem ülakoma
  40. '\u201c': '"', # vasak jutumärk
  41. '\u201d': '"', # parem jutumärk
  42. }
  43. for old, new in replacements.items():
  44. text = text.replace(old, new)
  45. return text
  46. def clean_markdown_for_pdf(text):
  47. """Konverteeri markdown ReportLab-ile sobivaks tekstiks"""
  48. if not text:
  49. return ""
  50. # Kui ei ole string, konverteeri stringiks
  51. if not isinstance(text, str):
  52. text = str(text)
  53. # Eemalda HTML sildid
  54. text = clean_html_tags(text)
  55. # Asenda markdown pealkirjad (# # # jne)
  56. text = re.sub(r'#{1,6}\s+', '', text)
  57. # Asenda bold, italic, strikethrough markeeringud
  58. text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) # **bold**
  59. text = re.sub(r'\*(.+?)\*', r'\1', text) # *italic*
  60. text = re.sub(r'__(.+?)__', r'\1', text) # __underline__
  61. text = re.sub(r'~~(.+?)~~', r'\1', text) # ~~strikethrough~~
  62. # Asenda loetelud
  63. text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
  64. text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)
  65. # Eemalda koodiblokid
  66. text = re.sub(r'```[^`]+```', '', text)
  67. text = re.sub(r'`([^`]+)`', r'[\1]', text)
  68. # Eemalda lingid (jäta ainult tekst)
  69. text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
  70. # Eemalda liigsed tühikud
  71. text = re.sub(r'\s+', ' ', text)
  72. return text.strip()
  73. def format_summary_for_pdf(summary):
  74. """Vorminda kokkuvõte PDF-ile sobivaks"""
  75. if not summary:
  76. return ""
  77. # Kui ei ole string, konverteeri
  78. if not isinstance(summary, str):
  79. summary = str(summary)
  80. # Eemalda kõik vormindus
  81. summary = clean_markdown_for_pdf(summary)
  82. # Lisa uued read peamiste sektsioonide ette
  83. summary = summary.replace('1. ARTIKLI PEAMISED PUNKTID:', '\n1. ARTIKLI PEAMISED PUNKTID:\n')
  84. summary = summary.replace('2. KASUTATUD MEETODID:', '\n\n2. KASUTATUD MEETODID:\n')
  85. summary = summary.replace('3. PEAMISED TULEMUSED:', '\n\n3. PEAMISED TULEMUSED:\n')
  86. summary = summary.replace('4. JÄRELDUSED JA SOOVITUSED:', '\n\n4. JÄRELDUSED JA SOOVITUSED:\n')
  87. summary = summary.replace('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n5. TRANSFORDIPLANEERIMISE KONTEKST:\n')
  88. # Asenda liigsed reavahetused
  89. summary = re.sub(r'\n{3,}', '\n\n', summary)
  90. # Lõika liiga pikk tekst
  91. if len(summary) > 4000:
  92. summary = summary[:4000] + "... [kokkuvõte lõigatud, liiga pikk]"
  93. return summary
  94. # ============================================================================
  95. # TRANSPORT KONTEKSTI PARSING
  96. # ============================================================================
  97. def extract_json_field(json_string, field_name):
  98. """
  99. Eralda JSON stringist konkreetne väli regex abil.
  100. Args:
  101. json_string: JSON tekst stringina
  102. field_name: välja nimi (nt "theoretical_contribution")
  103. Returns:
  104. Välja väärtus või None
  105. """
  106. if not json_string:
  107. return None
  108. pattern = rf'"{field_name}":\s*"([^"]*(?:\\"[^"]*)*)"'
  109. match = re.search(pattern, json_string)
  110. if match:
  111. return match.group(1)
  112. return None
  113. def extract_relevance_score(json_string):
  114. """Eralda relevance_score JSON stringist või tekstist."""
  115. if not json_string:
  116. return None
  117. pattern = r'"relevance_score":\s*(\d+(?:\.\d+)?)'
  118. match = re.search(pattern, json_string)
  119. if match:
  120. return match.group(1)
  121. return None
  122. def format_transport_context(transport_context):
  123. """
  124. Vorminda transpordi kontekst PDF-ile.
  125. """
  126. if not transport_context:
  127. return None
  128. # PARANDUS: Leia analüüsi tekst
  129. analysis_text = None
  130. # Kui on dict ja sisaldab 'analysis' võtit
  131. if isinstance(transport_context, dict):
  132. if 'analysis' in transport_context:
  133. analysis_text = transport_context['analysis']
  134. else:
  135. # Võib-olla kogu dict ON juba analysis?
  136. analysis_text = str(transport_context)
  137. elif isinstance(transport_context, str):
  138. analysis_text = transport_context
  139. else:
  140. return None
  141. if not analysis_text:
  142. return None
  143. txt = str(analysis_text)
  144. # EEMALDA ```json ... ``` markerid
  145. txt = re.sub(r'```json\s*', '', txt)
  146. txt = re.sub(r'\s*```', '', txt)
  147. txt = txt.strip()
  148. # DEBUG: Prindi välja pärast puhastamist
  149. #print("DEBUG after cleanup:", txt[:200])
  150. parsed = None
  151. # 1) Proovi parsida JSON otse
  152. try:
  153. parsed = json.loads(txt)
  154. #print("✅ JSON parsed successfully!")
  155. #print(f"DEBUG parsed keys: {list(parsed.keys())}")
  156. except json.JSONDecodeError as e:
  157. #print(f"❌ JSON parse failed: {e}")
  158. # Kui ei õnnestu, proovi leida {...} blokk
  159. m = re.search(r'\{.*\}', txt, flags=re.DOTALL)
  160. if m:
  161. json_candidate = m.group(0)
  162. try:
  163. parsed = json.loads(json_candidate)
  164. #print("✅ JSON parsed from candidate!")
  165. except json.JSONDecodeError as e2:
  166. print(f"❌ Candidate parse also failed: {e2}")
  167. parsed = None
  168. formatted_parts = []
  169. # Kui JSON parsimine õnnestus
  170. if isinstance(parsed, dict):
  171. #print("✅ Using parsed JSON dict")
  172. # KUI parsed sisaldab 'analysis' võtit, siis see on VEEL ÜKS string!
  173. # Peame seda UUESTI parsima!
  174. # KUI parsed sisaldab 'analysis' võtit, siis see on VEEL ÜKS string!
  175. if 'analysis' in parsed and isinstance(parsed['analysis'], str):
  176. #print("⚠️ 'analysis' is still a string, parsing again...")
  177. inner_txt = parsed['analysis']
  178. # Eemalda ```json markerid uuesti
  179. inner_txt = re.sub(r'```json\s*', '', inner_txt)
  180. inner_txt = re.sub(r'\s*```', '', inner_txt)
  181. inner_txt = inner_txt.strip()
  182. # PARANDUS: Asenda valed escaped quotes
  183. # \"word\" → "word" (ainult siis kui on tähtede vahel)
  184. inner_txt = re.sub(r'\\"([a-züõäöA-ZÜÕÄÖ]+)\\"', r'"\1"', inner_txt)
  185. # PARANDUS: Escape jutumärgid, mis on stringi väärtuste sees
  186. # Leia kõik "key": "value" paare ja escape "value" sees olevad jutumärgid
  187. def escape_quotes_in_values(match):
  188. key = match.group(1)
  189. value = match.group(2)
  190. # Escape jutumärgid value sees
  191. value_escaped = value.replace('"', '\\"')
  192. return f'"{key}": "{value_escaped}"'
  193. inner_txt = re.sub(r'"(\w+)":\s*"([^"]*(?:"[^"]*)*)"', escape_quotes_in_values, inner_txt)
  194. try:
  195. parsed = json.loads(inner_txt)
  196. #print("✅ Inner JSON parsed successfully!")
  197. #print(f"DEBUG inner parsed keys: {list(parsed.keys())}")
  198. except json.JSONDecodeError as e:
  199. #print(f"❌ Inner JSON parse failed: {e}")
  200. # AGRESSIIVNE PARANDUS: kasuta regex fallback'i
  201. #print("⚠️ Falling back to regex extraction...")
  202. # Taasta originaal inner_txt (ilma escapimiseta)
  203. inner_txt = parsed['analysis']
  204. inner_txt = re.sub(r'```json\s*', '', inner_txt)
  205. inner_txt = re.sub(r'\s*```', '', inner_txt)
  206. inner_txt = inner_txt.strip()
  207. # Kasuta regex'i otse inner_txt pealt
  208. temp_parts = []
  209. match = re.search(r'"theoretical_contribution":\s*"(.*?)"(?=\s*,\s*")', inner_txt, flags=re.DOTALL)
  210. if match:
  211. temp_parts.append("TEOREETILINE PANUS:\n" + match.group(1))
  212. match = re.search(r'"practical_applicability":\s*"(.*?)"(?=\s*,\s*")', inner_txt, flags=re.DOTALL)
  213. if match:
  214. temp_parts.append("PRAKTILINE RAKENDATAVUS:\n" + match.group(1))
  215. match = re.search(r'"problem_solving":\s*"(.*?)"(?=\s*,\s*")', inner_txt, flags=re.DOTALL)
  216. if match:
  217. temp_parts.append("PROBLEEMILAHENDUS:\n" + match.group(1))
  218. match = re.search(r'"limitations":\s*"(.*?)"(?=\s*,\s*")', inner_txt, flags=re.DOTALL)
  219. if match:
  220. temp_parts.append("PIIRANGUD:\n" + match.group(1))
  221. match = re.search(r'"relevance_score":\s*(\d+)', inner_txt)
  222. if match:
  223. temp_parts.append(f"RELEVANTSUSE SKOOR: {match.group(1)}/10")
  224. if temp_parts:
  225. #print(f"✅ Regex extracted {len(temp_parts)} parts")
  226. return "\n\n".join(temp_parts)
  227. # Nüüd kasuta parsed dict'i
  228. if parsed.get("theoretical_contribution"):
  229. formatted_parts.append(
  230. "TEOREETILINE PANUS:\n" + str(parsed["theoretical_contribution"])
  231. )
  232. if parsed.get("practical_applicability"):
  233. formatted_parts.append(
  234. "PRAKTILINE RAKENDATAVUS:\n" + str(parsed["practical_applicability"])
  235. )
  236. if parsed.get("problem_solving"):
  237. formatted_parts.append(
  238. "PROBLEEMILAHENDUS:\n" + str(parsed["problem_solving"])
  239. )
  240. if parsed.get("limitations"):
  241. formatted_parts.append(
  242. "PIIRANGUD:\n" + str(parsed["limitations"])
  243. )
  244. if "relevance_score" in parsed:
  245. formatted_parts.append(
  246. f"RELEVANTSUSE SKOOR: {parsed['relevance_score']}/10"
  247. )
  248. return "\n\n".join(formatted_parts) if formatted_parts else None
  249. # Kui JSON ei õnnestunud → kasuta regex-i
  250. print("⚠️ Using regex fallback")
  251. # Regex peab nüüd käsitlema newline't – kasuta re.DOTALL
  252. match = re.search(r'"theoretical_contribution":\s*"(.*?)"(?=\s*,|\s*})', txt, flags=re.DOTALL)
  253. if match:
  254. content = match.group(1).replace('\\n', '\n').replace('\\"', '"')
  255. if content:
  256. formatted_parts.append(f"TEOREETILINE PANUS:\n{content}")
  257. match = re.search(r'"practical_applicability":\s*"(.*?)"(?=\s*,|\s*})', txt, flags=re.DOTALL)
  258. if match:
  259. content = match.group(1).replace('\\n', '\n').replace('\\"', '"')
  260. if content:
  261. formatted_parts.append(f"PRAKTILINE RAKENDATAVUS:\n{content}")
  262. match = re.search(r'"problem_solving":\s*"(.*?)"(?=\s*,|\s*})', txt, flags=re.DOTALL)
  263. if match:
  264. content = match.group(1).replace('\\n', '\n').replace('\\"', '"')
  265. if content:
  266. formatted_parts.append(f"PROBLEEMILAHENDUS:\n{content}")
  267. match = re.search(r'"limitations":\s*"(.*?)"(?=\s*,|\s*})', txt, flags=re.DOTALL)
  268. if match:
  269. content = match.group(1).replace('\\n', '\n').replace('\\"', '"')
  270. if content:
  271. formatted_parts.append(f"PIIRANGUD:\n{content}")
  272. match = re.search(r'"relevance_score":\s*(\d+(?:\.\d+)?)', txt)
  273. if match:
  274. score = match.group(1)
  275. formatted_parts.append(f"RELEVANTSUSE SKOOR: {score}/10")
  276. return "\n\n".join(formatted_parts) if formatted_parts else None
  277. # ============================================================================
  278. # WEAVIATE ANDMEBAASIST PÄRING
  279. # ============================================================================
  280. def get_all_articles_from_weaviate():
  281. """Toob kõik artiklid Weaviate andmebaasist"""
  282. client = WeaviateClient()
  283. articles = []
  284. try:
  285. collection = client.client.collections.get("ScientificArticle")
  286. # Loendi kokku
  287. count_response = collection.aggregate.over_all(total_count=True)
  288. total = count_response.total_count
  289. print(f"Weaviate'is leidsin {total} artiklit")
  290. if total > 0:
  291. # Toob kõik artiklid
  292. response = collection.query.fetch_objects(limit=total)
  293. for obj in response.objects:
  294. try:
  295. article = {
  296. 'article_id': obj.properties.get('article_id', 'N/A'),
  297. 'title': clean_markdown_for_pdf(obj.properties.get('title', 'N/A')),
  298. 'authors': obj.properties.get('authors', []),
  299. 'year': obj.properties.get('year', 'N/A'),
  300. 'journal': clean_markdown_for_pdf(obj.properties.get('journal', 'N/A')),
  301. 'doi': obj.properties.get('doi', ''),
  302. 'abstract_en': clean_markdown_for_pdf(obj.properties.get('abstract_en', '')),
  303. 'summary_et': clean_markdown_for_pdf(obj.properties.get('summary_et', '')),
  304. 'key_concepts': [clean_markdown_for_pdf(c) for c in obj.properties.get('key_concepts', [])],
  305. 'methods_used': [clean_markdown_for_pdf(m) for m in obj.properties.get('methods_used', [])],
  306. 'transport_context': obj.properties.get('transport_context', {}),
  307. 'relevance_score': obj.properties.get('relevance_score', 'N/A'),
  308. 'processing_date': obj.properties.get('processing_date', ''),
  309. 'source_file': obj.properties.get('source_file', '')
  310. }
  311. articles.append(article)
  312. except Exception as e:
  313. print(f"⚠️ Viga artikli {obj.properties.get('article_id', 'unknown')} töötlemisel: {e}")
  314. # Jätka järgmise artikliga
  315. continue
  316. except Exception as e:
  317. print(f"❌ Viga artiklite toomisel: {e}")
  318. traceback.print_exc()
  319. finally:
  320. client.close()
  321. return articles
  322. # ============================================================================
  323. # PDF GENEREERIMINE
  324. # ============================================================================
  325. def create_pdf_from_articles(articles, output_filename):
  326. """Loob PDF faili artiklitest"""
  327. # Loo PDF dokument
  328. doc = SimpleDocTemplate(
  329. output_filename,
  330. pagesize=A4,
  331. rightMargin=72,
  332. leftMargin=72,
  333. topMargin=72,
  334. bottomMargin=72
  335. )
  336. # Stiilide loomine
  337. styles = getSampleStyleSheet()
  338. # Kohandatud stiilid
  339. title_style = ParagraphStyle(
  340. 'CustomTitle',
  341. parent=styles['Heading1'],
  342. fontSize=14,
  343. spaceAfter=12,
  344. textColor=colors.HexColor('#2c3e50'),
  345. alignment=TA_LEFT
  346. )
  347. subtitle_style = ParagraphStyle(
  348. 'CustomSubtitle',
  349. parent=styles['Heading2'],
  350. fontSize=12,
  351. spaceAfter=6,
  352. textColor=colors.HexColor('#34495e'),
  353. alignment=TA_LEFT
  354. )
  355. section_style = ParagraphStyle(
  356. 'CustomSection',
  357. parent=styles['Heading3'],
  358. fontSize=11,
  359. spaceAfter=6,
  360. spaceBefore=12,
  361. textColor=colors.HexColor('#7f8c8d'),
  362. alignment=TA_LEFT
  363. )
  364. normal_style = ParagraphStyle(
  365. 'CustomNormal',
  366. parent=styles['Normal'],
  367. fontSize=10,
  368. spaceAfter=6,
  369. alignment=TA_JUSTIFY,
  370. leading=14
  371. )
  372. metadata_style = ParagraphStyle(
  373. 'CustomMetadata',
  374. parent=styles['Normal'],
  375. fontSize=9,
  376. spaceAfter=3,
  377. textColor=colors.HexColor('#5d6d7e'),
  378. alignment=TA_LEFT
  379. )
  380. # Elementide kogumine
  381. elements = []
  382. # Pealkiri ja kokkuvõte
  383. elements.append(Paragraph("TEADUSARTIKLITE ANDMEBAAS", title_style))
  384. elements.append(Spacer(1, 12))
  385. today = datetime.now().strftime("%d.%m.%Y %H:%M")
  386. elements.append(Paragraph(f"Eksporditud: {today}", metadata_style))
  387. elements.append(Paragraph(f"Artikleid kokku: {len(articles)}", metadata_style))
  388. elements.append(Spacer(1, 24))
  389. # ========================================================================
  390. # Iga artikli jaoks
  391. # ========================================================================
  392. for i, article in enumerate(articles):
  393. # ARTIKLI PEALKIRI
  394. elements.append(Paragraph(f"{i+1}. {article['title']}", title_style))
  395. print(f"✅ {i+1}. {article['title']}")
  396. # AUTORID
  397. if article['authors']:
  398. authors_text = ", ".join(article['authors'])
  399. elements.append(Paragraph(f"Autorid: {authors_text}", subtitle_style))
  400. # METAANDMED (aasta, žurnaal, DOI, relevantsus)
  401. metadata_data = []
  402. if article['year'] and article['year'] != 'N/A':
  403. metadata_data.append(['Aasta:', str(article['year'])])
  404. if article['journal'] and article['journal'] != 'N/A':
  405. metadata_data.append(['Žurnaal:', article['journal']])
  406. if article['doi']:
  407. metadata_data.append(['DOI:', article['doi']])
  408. if article['relevance_score'] and article['relevance_score'] != 'N/A':
  409. metadata_data.append(['Relevantsus:', f"{article['relevance_score']}/10"])
  410. if metadata_data:
  411. metadata_table = Table(metadata_data, colWidths=[2*cm, 12*cm])
  412. metadata_table.setStyle(TableStyle([
  413. ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
  414. ('FONTSIZE', (0, 0), (-1, -1), 9),
  415. ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
  416. ('TOPPADDING', (0, 0), (-1, -1), 6),
  417. ('VALIGN', (0, 0), (-1, -1), 'TOP'),
  418. ('LEFTPADDING', (0, 0), (0, -1), 0),
  419. ]))
  420. elements.append(metadata_table)
  421. elements.append(Spacer(1, 12))
  422. # VÕTMESÕNAD JA MEETODID
  423. tags_data = []
  424. if article['key_concepts']:
  425. concepts_text = ", ".join(article['key_concepts'][:10]) # Piirang: 10 mõiste
  426. tags_data.append(['Võtmesõnad:', concepts_text])
  427. if article['methods_used']:
  428. methods_text = ", ".join(article['methods_used'])
  429. tags_data.append(['Meetodid:', methods_text])
  430. if tags_data:
  431. tags_table = Table(tags_data, colWidths=[2*cm, 12*cm])
  432. tags_table.setStyle(TableStyle([
  433. ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
  434. ('FONTSIZE', (0, 0), (-1, -1), 9),
  435. ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
  436. ('TOPPADDING', (0, 0), (-1, -1), 4),
  437. ('VALIGN', (0, 0), (-1, -1), 'TOP'),
  438. ('TEXTCOLOR', (0, 0), (0, -1), colors.HexColor('#5d6d7e')),
  439. ('LEFTPADDING', (0, 0), (0, -1), 0),
  440. ]))
  441. elements.append(tags_table)
  442. elements.append(Spacer(1, 12))
  443. # ABSTRAKT (inglise keeles)
  444. if article['abstract_en']:
  445. elements.append(Paragraph("ABSTRAKT (inglise keeles):", section_style))
  446. abstract_text = article['abstract_en']
  447. if len(abstract_text) > 800:
  448. abstract_text = abstract_text[:800] + "..."
  449. elements.append(Paragraph(abstract_text, normal_style))
  450. elements.append(Spacer(1, 12))
  451. # KOKKUVÕTE (eesti keeles)
  452. if article['summary_et']:
  453. elements.append(Paragraph("KOKKUVÕTE (eesti keeles):", section_style))
  454. summary = format_summary_for_pdf(article['summary_et'])
  455. elements.append(Paragraph(summary, normal_style))
  456. elements.append(Spacer(1, 12))
  457. # ====================================================================
  458. # TRANSPORDI PLANEERIMISE KONTEKST
  459. # ====================================================================
  460. if article['transport_context']:
  461. elements.append(Paragraph("TRANSFORDIPLANEERIMISE KONTEKST:", section_style))
  462. # DEBUG-REA – prindi üks-ühele objekt konsooli
  463. #print("DEBUG transport_context:", article['article_id'], article['transport_context'])
  464. context_text = format_transport_context(article['transport_context'])
  465. #print(f"DEBUG context_text returned: {context_text}") # <-- LISA SEE RIDA
  466. if context_text:
  467. # Jaga osadeks ja lisa eraldi paragrahfidena
  468. parts = context_text.split('\n\n') # Jaga tühjast reaga
  469. for part in parts:
  470. if part.strip():
  471. # Asenda \n <br/> tag'iga
  472. part_html = part.replace('\n', '<br/>')
  473. try:
  474. elements.append(Paragraph(part_html, normal_style))
  475. elements.append(Spacer(1, 6))
  476. except Exception as e:
  477. print(f"❌ Failed to add part to PDF: {e}")
  478. # Kui HTML tag ei tööta, proovi ilma
  479. part_plain = part.replace('\n', ' ')
  480. elements.append(Paragraph(part_plain, normal_style))
  481. elements.append(Spacer(1, 6))
  482. print("✅ Context added to PDF successfully")
  483. else:
  484. elements.append(Paragraph("Analüüsi andmed puuduvad", normal_style))
  485. elements.append(Spacer(1, 12))
  486. # ====================================================================
  487. # FOOTER INFO (allikfail, töötlemise kuupäev)
  488. # ====================================================================
  489. footer_info = []
  490. if article['source_file']:
  491. source_name = os.path.basename(article['source_file'])
  492. footer_info.append(f"Allikfail: {source_name}")
  493. if article['processing_date']:
  494. try:
  495. date_str = article['processing_date']
  496. if '.' in date_str:
  497. date_str = date_str.split('.')[0]
  498. date_str = date_str.replace('Z', '+00:00')
  499. date_obj = datetime.fromisoformat(date_str)
  500. footer_info.append(f"Töödeldud: {date_obj.strftime('%d.%m.%Y %H:%M')}")
  501. except Exception:
  502. footer_info.append(f"Töödeldud: {article['processing_date'][:19]}")
  503. if footer_info:
  504. elements.append(Spacer(1, 6))
  505. elements.append(Paragraph(" | ".join(footer_info), metadata_style))
  506. # Lisa lehevahetus (välja arvatud viimase artikli puhul)
  507. if i < len(articles) - 1:
  508. elements.append(PageBreak())
  509. else:
  510. elements.append(Spacer(1, 24))
  511. # LÕPPINFO
  512. elements.append(Paragraph("=" * 80, metadata_style))
  513. elements.append(Spacer(1, 6))
  514. elements.append(Paragraph(f"Kokku eksporditud artikleid: {len(articles)}", metadata_style))
  515. elements.append(Paragraph("Eksporditud Weaviate teadusartiklite andmebaasist", metadata_style))
  516. elements.append(Paragraph(f"PDF genereeritud: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}", metadata_style))
  517. # Koosta PDF
  518. doc.build(elements)
  519. return len(articles)
  520. # ============================================================================
  521. # PEAMINE FUNKTSIOON
  522. # ============================================================================
  523. def main():
  524. """Peamine funktsioon - käivitab kogu protsessi"""
  525. print("=" * 60)
  526. print("ARTIKLITE EKSPORT PDF FAILI")
  527. print("=" * 60)
  528. # Toob artiklid Weaviate'ist
  529. print("Toon artikleid Weaviate'ist...")
  530. articles = get_all_articles_from_weaviate()
  531. if not articles:
  532. print("❌ Ei leidnud ühtegi artiklit Weaviate'is!")
  533. return
  534. print(f"✓ Leidsin {len(articles)} artiklit")
  535. # Genereeri PDF failinimi
  536. timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
  537. output_dir = "./data/exports"
  538. os.makedirs(output_dir, exist_ok=True)
  539. output_filename = os.path.join(output_dir, f"artiklid_eksport_{timestamp}.pdf")
  540. # Loo PDF
  541. print(f"Loon PDF faili: {output_filename}")
  542. try:
  543. article_count = create_pdf_from_articles(articles, output_filename)
  544. print("=" * 60)
  545. print(f"✅ VALMIS! Loodud PDF fail: {output_filename}")
  546. print(f" - Eksporditud artikleid: {article_count}")
  547. print(f" - Faili suurus: {os.path.getsize(output_filename) / 1024:.1f} KB")
  548. print("=" * 60)
  549. if articles:
  550. print("\nEsimesed artiklid:")
  551. for i, article in enumerate(articles[:3]):
  552. title_preview = article['title']
  553. if len(title_preview) > 60:
  554. title_preview = title_preview[:60] + "..."
  555. print(f" {i+1}. {title_preview}")
  556. except Exception as e:
  557. print(f"\n❌ VIGA PDF loomisel: {e}")
  558. traceback.print_exc()
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()