# save_articles_to_pdf.py (24 KB)
  2. import os
  3. import sys
  4. import re
  5. from datetime import datetime
  6. from reportlab.lib.pagesizes import letter, A4
  7. from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle
  8. from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
  9. from reportlab.lib.enums import TA_JUSTIFY, TA_CENTER, TA_LEFT
  10. from reportlab.lib import colors
  11. from reportlab.lib.units import inch, cm
  12. from reportlab.pdfbase import pdfmetrics
  13. from reportlab.pdfbase.ttfonts import TTFont
  14. import json
  15. # Lisa src kaust Pythoni teele
  16. sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
  17. from src.weaviate_client import WeaviateClient
  def clean_html_tags(text):
      """Strip HTML/XML tags from text and decode a few common entities.

      Args:
          text: Source string. Any falsy value (``None``, ``""``) yields ``""``.

      Returns:
          The text without ``<...>`` spans, with the entities listed below
          decoded and typographic punctuation replaced by ASCII equivalents.
      """
      if not text:
          return ""

      # Remove every tag-like <...> span before decoding entities, so text
      # that was written as "&lt;b&gt;" survives as the literal "<b>".
      text = re.sub(r'<[^>]+>', '', text)

      replacements = {
          '&nbsp;': ' ',
          '&lt;': '<',
          '&gt;': '>',
          '&quot;': '"',
          '&#39;': "'",
          '&apos;': "'",
          '\u00a0': ' ',    # non-breaking space
          '\u2026': '...',  # ellipsis
          '\u2013': '-',    # en dash
          '\u2014': '-',    # em dash
          '\u2018': "'",    # left single quote
          '\u2019': "'",    # right single quote
          '\u201c': '"',    # left double quote
          '\u201d': '"',    # right double quote
      }
      for old, new in replacements.items():
          text = text.replace(old, new)

      # "&amp;" is decoded last: decoding it first turns double-escaped input
      # such as "&amp;lt;" into "&lt;" and then, wrongly, into "<".
      return text.replace('&amp;', '&')
  def clean_markdown_for_pdf(text):
      """Reduce markdown-formatted text to a single line of plain text.

      Args:
          text: Value to clean. Falsy values yield ``""``; other non-string
              values are converted with ``str()`` first.

      Returns:
          The text with HTML tags, heading markers, emphasis markers, fenced
          code blocks and link targets removed, list markers normalised, and
          every run of whitespace (including newlines) collapsed to one space.
      """
      if not text:
          return ""
      if not isinstance(text, str):
          text = str(text)

      text = clean_html_tags(text)

      # Heading markers: only a run of '#' that starts a token is removed, so
      # a '#' glued to a word ("C# and F#") is left alone.
      text = re.sub(r'(?<!\S)#{1,6}\s+', '', text)

      # List markers are rewritten before emphasis is stripped; otherwise the
      # italic pattern pairs a leading "* " bullet with the next asterisk on
      # the line and swallows the bullet.
      text = re.sub(r'^\s*[-*+]\s+', '• ', text, flags=re.MULTILINE)
      text = re.sub(r'^\s*(\d+)\.\s+', r'\1. ', text, flags=re.MULTILINE)

      # Emphasis markers: keep the inner text only.
      text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)  # **bold**
      text = re.sub(r'\*(.+?)\*', r'\1', text)      # *italic*
      text = re.sub(r'__(.+?)__', r'\1', text)      # __underline__
      text = re.sub(r'~~(.+?)~~', r'\1', text)      # ~~strikethrough~~

      # Fenced code blocks are dropped entirely.
      text = re.sub(r'```[^`]+```', '', text)

      # Links are reduced to their label before inline code is bracketed, so
      # that "`f`(x)" is not mistaken for the link "[f](x)".
      text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)
      text = re.sub(r'`([^`]+)`', r'[\1]', text)

      # Collapse all whitespace, newlines included, into single spaces.
      text = re.sub(r'\s+', ' ', text)
      return text.strip()
  def clean_json_markers(text):
      """Remove the ```json / ``` fence markers that wrap a JSON payload.

      Args:
          text: String that may contain fence markers. Non-string values
              (for example an already decoded dict, or ``None``) are returned
              unchanged instead of raising ``TypeError`` inside ``re.sub``.

      Returns:
          The stripped text without fence markers, or the original object when
          it is not a string.
      """
      if not isinstance(text, str):
          return text

      # Opening marker at the start of a line.
      text = re.sub(r'^```json\s*', '', text, flags=re.MULTILINE)
      # Closing marker at the end of a line.
      text = re.sub(r'\s*```$', '', text, flags=re.MULTILINE)
      # Any marker still left elsewhere in the text.
      text = re.sub(r'\s*```(json)?\s*', '', text)
      return text.strip()
  def clean_json_string(text):
      """Normalise a JSON document that was stored as an escaped/quoted string.

      Args:
          text: String holding JSON, possibly wrapped in double quotes and
              with its inner quotes escaped as ``\\"``.

      Returns:
          The text with CR/LF characters removed. If that is already a valid
          JSON object or array it is returned as is; otherwise ``\\"`` is
          replaced by ``"`` and one pair of surrounding double quotes is
          removed.
      """
      text = text.replace('\n', '').replace('\r', '')

      # Already valid JSON must not be touched: unescaping \" inside a valid
      # document would terminate its strings early and break it.
      if text[:1] in ('{', '['):
          try:
              json.loads(text)
          except ValueError:
              pass
          else:
              return text

      text = text.replace('\\"', '"')
      if text.startswith('"') and text.endswith('"'):
          text = text[1:-1]
      return text
  def extract_and_format_json(data):
      """Render the known analysis fields of a dict as titled text sections.

      Args:
          data: Mapping that may contain the analysis keys listed in
              ``key_map``. Anything that is not a dict yields ``""``; for a
              string, ``key in data`` would be a substring test and
              ``data[key]`` would then raise ``TypeError``.

      Returns:
          For every present, non-empty field, in ``key_map`` order: the
          English title, the value, and an empty line, joined with newlines.
      """
      if not isinstance(data, dict):
          return ""

      # Field key -> section title.
      key_map = {
          "theoretical_contribution": "Theoretical contribution",
          "practical_applicability": "Practical applicability",
          "problem_solving": "Problem solving",
          "limitations": "Limitations",
          "future_research": "Future research",
          "methodology": "Methodology",
      }

      formatted_parts = []
      for key, title in key_map.items():
          value = data.get(key)
          if value:
              # The trailing "" produces a blank line after each section.
              formatted_parts.extend((title, str(value), ""))
      return "\n".join(formatted_parts)
  def process_json_text(input_text):
      """Turn a JSON document holding article analysis into readable text.

      The analysis is looked up in three places, in this order:
        1. an ``"analysis"`` field (a dict, or a string holding JSON);
        2. the top-level object itself, when it carries the analysis keys;
        3. any string field that mentions analysis terms and decodes to a
           JSON object.

      Args:
          input_text: JSON text to decode.

      Returns:
          The formatted analysis, or a message string when the input cannot
          be decoded or holds no analysis. This function does not raise;
          every failure is reported through the returned string.
      """
      not_found = "No analysis data found in JSON"
      try:
          parsed = json.loads(input_text)
          if not isinstance(parsed, dict):
              # A list, number or string at top level has no fields to search.
              return not_found

          analysis_data = None
          if "analysis" in parsed:
              raw_analysis = parsed["analysis"]
              if isinstance(raw_analysis, dict):
                  analysis_data = raw_analysis
              else:
                  try:
                      analysis_data = json.loads(clean_json_string(str(raw_analysis)))
                  except ValueError:
                      # Not JSON: keep the value itself.
                      analysis_data = raw_analysis
          elif any(key in parsed for key in ("theoretical_contribution", "practical_applicability")):
              analysis_data = parsed
          else:
              hints = ("theoretical", "practical", "contribution")
              for value in parsed.values():
                  if not isinstance(value, str):
                      continue
                  lowered = value.lower()
                  if not any(hint in lowered for hint in hints):
                      continue
                  try:
                      candidate = json.loads(clean_json_string(value))
                  except ValueError:
                      continue
                  if isinstance(candidate, dict):
                      analysis_data = candidate
                      break

          if isinstance(analysis_data, dict) and analysis_data:
              return extract_and_format_json(analysis_data)
          if isinstance(analysis_data, str) and analysis_data.strip():
              # Plain-text analysis: there are no fields to format, so the
              # text is returned as is.
              return analysis_data.strip()
          return not_found
      except json.JSONDecodeError as e:
          return f"JSON parsing error: {str(e)}"
      except Exception as e:
          return f"Error: {str(e)}"
  def parse_transport_context(context_data):
      """Decode the stored transport context into a displayable structure.

      Args:
          context_data: JSON text (optionally wrapped in ```json fences), an
              already decoded dict, or any other value.

      Returns:
          * for a string: the decoded JSON value, or
            ``{"raw_analysis": <cleaned text>}`` when it is not valid JSON;
          * for a dict: a copy whose string values have fence markers and
            markdown removed;
          * anything else: the value unchanged.
      """
      if isinstance(context_data, str):
          # Fence markers are stripped only from strings; running the regex
          # helper on a dict or None raises TypeError.
          stripped = clean_json_markers(context_data)
          try:
              return json.loads(stripped)
          except json.JSONDecodeError:
              return {"raw_analysis": clean_markdown_for_pdf(stripped)}

      if isinstance(context_data, dict):
          cleaned = {}
          for key, value in context_data.items():
              if isinstance(value, str):
                  # Same order as the string branch: fences first, so that
                  # the markdown cleaner does not drop a fenced payload.
                  cleaned[key] = clean_markdown_for_pdf(clean_json_markers(value))
              else:
                  cleaned[key] = value
          return cleaned

      return context_data
  def format_context_for_pdf(parsed_context):
      """Format a parsed transport context as labelled paragraphs of text.

      Args:
          parsed_context: Dict with the analysis fields, or any other value.

      Returns:
          For a dict: one ``LABEL: value`` entry per present, non-empty field
          (the relevance score is included whenever it is not ``None``, so a
          score of 0 is kept), joined by blank lines. For anything else: the
          value converted to a string and cleaned of markdown.
      """
      if not isinstance(parsed_context, dict):
          return clean_markdown_for_pdf(str(parsed_context))

      labelled_fields = (
          ('theoretical_contribution', 'TEOREETILINE PANUS'),
          ('practical_applicability', 'PRAKTILINE RAKENDATAVUS'),
          ('problem_solving', 'PROBLEEMILAHENDUS'),
          ('limitations', 'PIIRANGUD'),
      )
      formatted = [
          f"{label}: {parsed_context[key]}"
          for key, label in labelled_fields
          if parsed_context.get(key)
      ]

      # A missing score must not be rendered as "None/10".
      if parsed_context.get('relevance_score') is not None:
          formatted.append(f"RELEVANTSUSE SKOOR: {parsed_context['relevance_score']}/10")

      # Both keys carry free-text analysis and share one label.
      for key in ('analysis', 'raw_analysis'):
          if parsed_context.get(key):
              formatted.append(f"ANALÜÜS: {parsed_context[key]}")

      return "\n\n".join(formatted)
  def get_all_articles_from_weaviate():
      """Fetch every object of the ``ScientificArticle`` collection.

      Each object is converted to a plain dict with cleaned text fields. When
      cleaning fails for an object, it is added with its values converted by
      ``str()`` only; when that fails too, the object is skipped so that one
      bad record cannot abort the export.

      Returns:
          List of article dicts. The list is empty (or partial) when the
          query itself fails; the error is printed with a traceback instead
          of being raised. The client is always closed.
      """
      client = WeaviateClient()
      articles = []
      try:
          collection = client.client.collections.get("ScientificArticle")

          total = collection.aggregate.over_all(total_count=True).total_count
          print(f"Weaviate'is leidsin {total} artiklit")

          if total and total > 0:
              # NOTE(review): everything is requested in a single call;
              # presumably fine for small collections, confirm against the
              # server's maximum result limit.
              response = collection.query.fetch_objects(limit=total)
              for obj in response.objects:
                  props = obj.properties
                  try:
                      article = _article_from_properties(props, cleaned=True)
                  except Exception as e:
                      print(f" Viga artikli {props.get('article_id', 'unknown')} töötlemisel: {e}")
                      try:
                          article = _article_from_properties(props, cleaned=False)
                      except Exception as fallback_error:
                          print(f" Artikkel {props.get('article_id', 'unknown')} jäeti vahele: {fallback_error}")
                          continue
                  articles.append(article)
      except Exception as e:
          print(f"Viga artiklite toomisel: {e}")
          import traceback
          traceback.print_exc()
      finally:
          client.close()
      return articles


  def _article_from_properties(props, cleaned=True):
      """Build one article dict from the property mapping of a stored object.

      With ``cleaned=True`` text goes through ``clean_markdown_for_pdf`` and
      the transport context through ``parse_transport_context``; with
      ``cleaned=False`` values are only converted with ``str()``.
      """
      def as_text(value):
          if cleaned:
              return clean_markdown_for_pdf(value)
          return "" if value is None else str(value)

      raw_context = props.get('transport_context', {})
      if cleaned:
          context = parse_transport_context(raw_context)
      elif isinstance(raw_context, dict):
          # A dict stays a dict so later code can still look fields up in it.
          context = raw_context
      else:
          context = str(raw_context)

      # "or []": a property that is stored but empty comes back as None,
      # which .get(key, []) does not replace and which cannot be iterated.
      return {
          'article_id': props.get('article_id', 'N/A'),
          'title': as_text(props.get('title', 'N/A')),
          'authors': props.get('authors') or [],
          'year': props.get('year', 'N/A'),
          'journal': as_text(props.get('journal', 'N/A')),
          'doi': props.get('doi', ''),
          'abstract_en': as_text(props.get('abstract_en', '')),
          'summary_et': as_text(props.get('summary_et', '')),
          'key_concepts': [as_text(c) for c in props.get('key_concepts') or []],
          'methods_used': [as_text(m) for m in props.get('methods_used') or []],
          'transport_context': context,
          'relevance_score': props.get('relevance_score', 'N/A'),
          'processing_date': props.get('processing_date', ''),
          'source_file': props.get('source_file', ''),
      }
  def format_summary_for_pdf(summary, max_length=4000):
      """Prepare an article summary for the PDF.

      The summary is reduced to plain text, line breaks are re-inserted
      around the five numbered section headings, and over-long text is cut.

      Args:
          summary: Summary text; falsy values yield ``""`` and other
              non-string values are converted with ``str()``.
          max_length: Maximum number of characters kept before the
              truncation notice is appended; ``None`` disables truncation.

      Returns:
          The formatted summary string.
      """
      if not summary:
          return ""
      if not isinstance(summary, str):
          summary = str(summary)

      # Cleaning collapses all whitespace, so the section breaks are
      # re-created afterwards.
      summary = clean_markdown_for_pdf(summary)

      # (heading, newlines placed in front of it); the headings are matched
      # literally against the summary text.
      section_breaks = (
          ('1. ARTIKLI PEAMISED PUNKTID:', '\n'),
          ('2. KASUTATUD MEETODID:', '\n\n'),
          ('3. PEAMISED TULEMUSED:', '\n\n'),
          ('4. JÄRELDUSED JA SOOVITUSED:', '\n\n'),
          ('5. TRANSFORDIPLANEERIMISE KONTEKST:', '\n\n'),
      )
      for heading, lead in section_breaks:
          summary = summary.replace(heading, f"{lead}{heading}\n")

      # Never more than one blank line in a row.
      summary = re.sub(r'\n{3,}', '\n\n', summary)

      if max_length is not None and len(summary) > max_length:
          summary = summary[:max_length] + "... [kokkuvõte lõigatud, liiga pikk]"
      return summary
  def create_pdf_from_articles(articles, output_filename):
      """Write all articles into one A4 PDF document.

      The document starts with a short header (export time, article count),
      then has one section per article (title, authors, metadata table,
      keyword/method table, abstract, summary, transport context, source
      footer) separated by page breaks, and ends with an export footer.

      Args:
          articles: List of article dicts as produced by
              ``get_all_articles_from_weaviate``.
          output_filename: Path of the PDF file to create.

      Returns:
          The number of articles written.
      """
      doc = SimpleDocTemplate(
          output_filename,
          pagesize=A4,
          rightMargin=72,
          leftMargin=72,
          topMargin=72,
          bottomMargin=72,
      )

      styles = getSampleStyleSheet()
      title_style = ParagraphStyle(
          'CustomTitle',
          parent=styles['Heading1'],
          fontSize=14,
          spaceAfter=12,
          textColor=colors.HexColor('#2c3e50'),
          alignment=TA_LEFT,
      )
      subtitle_style = ParagraphStyle(
          'CustomSubtitle',
          parent=styles['Heading2'],
          fontSize=12,
          spaceAfter=6,
          textColor=colors.HexColor('#34495e'),
          alignment=TA_LEFT,
      )
      section_style = ParagraphStyle(
          'CustomSection',
          parent=styles['Heading3'],
          fontSize=11,
          spaceAfter=6,
          spaceBefore=12,
          textColor=colors.HexColor('#7f8c8d'),
          alignment=TA_LEFT,
      )
      normal_style = ParagraphStyle(
          'CustomNormal',
          parent=styles['Normal'],
          fontSize=10,
          spaceAfter=6,
          alignment=TA_JUSTIFY,
          leading=14,  # line spacing
      )
      metadata_style = ParagraphStyle(
          'CustomMetadata',
          parent=styles['Normal'],
          fontSize=9,
          spaceAfter=3,
          textColor=colors.HexColor('#5d6d7e'),
          alignment=TA_LEFT,
      )

      elements = []

      # Document header.
      elements.append(Paragraph("TEADUSARTIKLITE ANDMEBAAS", title_style))
      elements.append(Spacer(1, 12))
      today = datetime.now().strftime("%d.%m.%Y %H:%M")
      elements.append(Paragraph(f"Eksporditud: {today}", metadata_style))
      elements.append(Paragraph(f"Artikleid kokku: {len(articles)}", metadata_style))
      elements.append(Spacer(1, 24))

      for number, article in enumerate(articles, start=1):
          # Paragraph text is parsed as markup, so every value that comes
          # from the data is escaped before it is embedded.
          elements.append(Paragraph(f"{number}. {_pdf_markup(article.get('title', ''))}", title_style))

          authors = article.get('authors')
          if authors:
              if isinstance(authors, str):
                  authors_text = authors
              else:
                  authors_text = ", ".join(str(author) for author in authors)
              elements.append(Paragraph(f"<b>Autorid:</b> {_pdf_markup(authors_text)}", subtitle_style))

          # Metadata table.
          metadata_data = []
          year = article.get('year')
          if year and year != 'N/A':
              metadata_data.append(['Aasta:', str(year)])
          journal = article.get('journal')
          if journal and journal != 'N/A':
              metadata_data.append(['Žurnaal:', str(journal)])
          if article.get('doi'):
              metadata_data.append(['DOI:', str(article['doi'])])
          relevance = article.get('relevance_score')
          if relevance and relevance != 'N/A':
              metadata_data.append(['Relevantsus:', f"{relevance}/10"])
          if metadata_data:
              elements.append(_two_column_table(metadata_data, padding=6))
              elements.append(Spacer(1, 12))

          # Keywords and methods table.
          tags_data = []
          key_concepts = article.get('key_concepts')
          if key_concepts:
              # At most 10 concepts are listed.
              tags_data.append(['Võtmesõnad:', ", ".join(str(c) for c in key_concepts[:10])])
          methods_used = article.get('methods_used')
          if methods_used:
              tags_data.append(['Meetodid:', ", ".join(str(m) for m in methods_used)])
          if tags_data:
              elements.append(_two_column_table(tags_data, padding=4, label_color=colors.HexColor('#5d6d7e')))
              elements.append(Spacer(1, 12))

          # Abstract, cut to 800 characters.
          abstract_text = article.get('abstract_en')
          if abstract_text:
              elements.append(Paragraph("<b>ABSTRAKT (inglise keeles):</b>", section_style))
              abstract_text = str(abstract_text)
              if len(abstract_text) > 800:
                  abstract_text = abstract_text[:800] + "..."
              elements.append(Paragraph(_pdf_markup(abstract_text), normal_style))
              elements.append(Spacer(1, 12))

          # Summary; its line breaks are turned into <br/> by _pdf_markup.
          if article.get('summary_et'):
              elements.append(Paragraph("<b>KOKKUVÕTE (eesti keeles):</b>", section_style))
              summary = format_summary_for_pdf(article['summary_et'])
              elements.append(Paragraph(_pdf_markup(summary), normal_style))
              elements.append(Spacer(1, 12))

          # Transport context.
          context = article.get('transport_context')
          if context:
              elements.append(Paragraph("<b>TRANSFORDIPLANEERIMISE KONTEKST:</b>", section_style))

              if isinstance(context, dict):
                  score = context.get('relevance_score')
                  analysis = context.get('analysis') or context.get('raw_analysis')
                  if not analysis:
                      # No nested analysis: the fields may sit directly in
                      # the context; the score is left out here because it
                      # is printed separately below.
                      analysis = {k: v for k, v in context.items() if k != 'relevance_score'}
              else:
                  # The context may be plain text rather than a dict.
                  score, analysis = None, context

              if score is not None and str(score).strip():
                  elements.append(Paragraph("RELEVANTSUSE SKOOR: " + _pdf_markup(score), normal_style))
              elements.append(Spacer(1, 1))
              elements.append(Paragraph("<b>ANALÜÜS:</b>", normal_style))

              fields = _extract_analysis_fields(analysis)
              for key, label in _ANALYSIS_SECTIONS:
                  if fields.get(key):
                      elements.append(Paragraph(f"<b>{label}:</b>", normal_style))
                      elements.append(Paragraph(_pdf_markup(fields[key]), normal_style))
              if 'relevance_score' in fields:
                  elements.append(Paragraph(
                      "<b>RELEVANTSUSE SKOOR:</b> " + _pdf_markup(fields['relevance_score']),
                      normal_style,
                  ))

          # Source file and processing time.
          footer_info = []
          if article.get('source_file'):
              source_name = os.path.basename(str(article['source_file']))
              footer_info.append(f"Allikfail: {source_name}")
          if article.get('processing_date'):
              footer_info.append(f"Töödeldud: {_format_processing_date(article['processing_date'])}")
          if footer_info:
              elements.append(Spacer(1, 6))
              elements.append(Paragraph(_pdf_markup(" | ".join(footer_info)), metadata_style))

          # Page break after every article except the last one.
          if number < len(articles):
              elements.append(PageBreak())
          else:
              elements.append(Spacer(1, 24))

      # Export footer.
      elements.append(Paragraph("=" * 80, metadata_style))
      elements.append(Spacer(1, 6))
      elements.append(Paragraph(f"Kokku eksporditud artikleid: {len(articles)}", metadata_style))
      elements.append(Paragraph("Eksporditud Weaviate teadusartiklite andmebaasist", metadata_style))
      elements.append(Paragraph(f"PDF genereeritud: {datetime.now().strftime('%d.%m.%Y %H:%M:%S')}", metadata_style))

      doc.build(elements)
      return len(articles)


  # (field key, label printed in the PDF) for the text fields of an analysis.
  _ANALYSIS_SECTIONS = (
      ('theoretical_contribution', 'TEOREETILINE PANUS'),
      ('practical_applicability', 'PRAKTILINE RAKENDATAVUS'),
      ('problem_solving', 'PROBLEEMILAHENDUS'),
      ('limitations', 'PIIRANGUD'),
  )


  def _pdf_markup(value):
      """Escape a value for use as Paragraph text, keeping its line breaks."""
      from xml.sax.saxutils import escape

      if isinstance(value, (list, tuple)):
          value = "; ".join(str(item) for item in value)
      return escape(str(value)).replace('\n', '<br/>')


  def _format_processing_date(value):
      """Format a processing date as ``dd.mm.YYYY HH:MM``; fall back to its first 19 characters."""
      if isinstance(value, datetime):
          return value.strftime('%d.%m.%Y %H:%M')
      text = str(value)
      try:
          # Fractional seconds (and whatever follows them) are dropped, and a
          # trailing "Z" is rewritten as an explicit UTC offset.
          stamp = text.split('.')[0].replace('Z', '+00:00')
          return datetime.fromisoformat(stamp).strftime('%d.%m.%Y %H:%M')
      except ValueError:
          return text[:19]


  def _extract_analysis_fields(analysis):
      """Collect the known analysis fields from a dict or from JSON-like text."""
      if isinstance(analysis, str):
          # Strict decoding first; the regex search below is only the
          # fallback for text that is not a complete JSON document.
          candidate = re.sub(r'```(?:json)?', '', analysis).strip()
          try:
              decoded = json.loads(candidate, strict=False)
          except ValueError:
              decoded = None
          if isinstance(decoded, dict):
              analysis = decoded

      found = {}
      if isinstance(analysis, dict):
          for key, _label in _ANALYSIS_SECTIONS:
              if analysis.get(key):
                  found[key] = analysis[key]
          if analysis.get('relevance_score') is not None:
              found['relevance_score'] = analysis['relevance_score']
          return found
      if not isinstance(analysis, str):
          return found

      for key, _label in _ANALYSIS_SECTIONS:
          match = re.search(r'"%s":\s*"((?:[^"\\]|\\.)*)"' % key, analysis)
          if not match or not match.group(1):
              continue
          raw_value = match.group(1)
          try:
              # Resolve JSON escapes such as \" and \n in the captured text.
              found[key] = json.loads('"%s"' % raw_value, strict=False)
          except ValueError:
              found[key] = raw_value
      match = re.search(r'"relevance_score":\s*(\d+(?:\.\d+)?)', analysis)
      if match:
          found['relevance_score'] = match.group(1)
      return found


  def _two_column_table(rows, padding, label_color=None):
      """Build a label/value table; ``label_color`` tints the label column."""
      commands = [
          ('FONTNAME', (0, 0), (-1, -1), 'Helvetica'),
          ('FONTSIZE', (0, 0), (-1, -1), 9),
          ('BOTTOMPADDING', (0, 0), (-1, -1), padding),
          ('TOPPADDING', (0, 0), (-1, -1), padding),
          ('VALIGN', (0, 0), (-1, -1), 'TOP'),
      ]
      if label_color is not None:
          commands.append(('TEXTCOLOR', (0, 0), (0, -1), label_color))
      commands.append(('LEFTPADDING', (0, 0), (0, -1), 0))

      table = Table(rows, colWidths=[2 * cm, 12 * cm])
      table.setStyle(TableStyle(commands))
      return table
  def main():
      """Export every article found in Weaviate into a timestamped PDF file.

      The PDF is written to ``./data/exports`` (created when missing).
      Progress, the resulting file size and the first three titles are
      printed. A failure while building the PDF is printed with its
      traceback instead of being raised.
      """
      banner = "=" * 60
      print(banner)
      print("ARTIKLITE EKSPORT PDF FAILI")
      print(banner)

      print("Toon artikleid Weaviate'ist...")
      articles = get_all_articles_from_weaviate()
      if not articles:
          print("Ei leidnud ühtegi artiklit Weaviate'is!")
          return
      print(f"Leidsin {len(articles)} artiklit")

      # Output file name carries the export timestamp.
      timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
      output_dir = "./data/exports"
      os.makedirs(output_dir, exist_ok=True)
      output_filename = os.path.join(output_dir, f"artiklid_eksport_{timestamp}.pdf")

      print(f"Loon PDF faili: {output_filename}")
      try:
          article_count = create_pdf_from_articles(articles, output_filename)
          print(banner)
          print(f"✅ VALMIS! Loodud PDF fail: {output_filename}")
          print(f" - Eksporditud artikleid: {article_count}")
          print(f" - Faili suurus: {os.path.getsize(output_filename) / 1024:.1f} KB")
          print(banner)

          # Preview of the first three titles, cut to 60 characters.
          # `articles` is known to be non-empty here (see the early return).
          print("\nEsimesed artiklid:")
          for number, article in enumerate(articles[:3], start=1):
              title_preview = article['title']
              if len(title_preview) > 60:
                  title_preview = title_preview[:60] + "..."
              print(f" {number}. {title_preview}")
      except Exception as e:
          print(f"\n❌ VIGA PDF loomisel: {e}")
          import traceback
          traceback.print_exc()


  if __name__ == "__main__":
      main()