# file: src/clean_and_normalize.py
import json
import os
import re
from pathlib import Path

import psycopg2
from dotenv import load_dotenv
  7. load_dotenv()
  8. EXTRACTED_OUTPUT_DIR = Path(os.getenv("EXTRACTED_OUTPUT_DIR", "data/extracted"))
  9. DB_HOST = os.getenv("DB_HOST", "localhost")
  10. DB_PORT = os.getenv("DB_PORT", "5432")
  11. DB_NAME = os.getenv("DB_NAME", "pdf_research")
  12. DB_USER = os.getenv("DB_USER", "pdf_user")
  13. DB_PASSWORD = os.getenv("DB_PASSWORD")
  14. def get_db_conn():
  15. return psycopg2.connect(
  16. host=DB_HOST,
  17. port=DB_PORT,
  18. database=DB_NAME,
  19. user=DB_USER,
  20. password=DB_PASSWORD,
  21. )
  22. def clean_text_basic(text: str) -> str:
  23. # Väga lihtne puhastus: trimmime, asendame mitmekordsed tühikud ühega
  24. import re
  25. if not text:
  26. return ""
  27. text = text.replace("\r", "\n")
  28. text = text.replace("\u00a0", " ") # non‑breaking space
  29. text = re.sub(r"[ \t]+", " ", text)
  30. text = re.sub(r"\n{3,}", "\n\n", text)
  31. return text.strip()
  32. def process_single_doc(raw_doc_id: int, filename: str):
  33. pdf_stem = Path(filename).stem
  34. json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"
  35. if not json_path.exists():
  36. print(f"❌ pages.json puudub: {json_path}")
  37. return
  38. pages_data = json.loads(json_path.read_text())
  39. conn = get_db_conn()
  40. cur = conn.cursor()
  41. inserted = 0
  42. for page in pages_data:
  43. page_num = page["page"]
  44. raw_text = page.get("text") or ""
  45. text = clean_text_basic(raw_text)
  46. if not text:
  47. print(f" ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
  48. continue
  49. cur.execute("""
  50. INSERT INTO processed_documents
  51. (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
  52. VALUES
  53. (%s, %s, %s, %s, %s, %s, %s)
  54. """, (
  55. raw_doc_id,
  56. page_num,
  57. f"page_{page_num}",
  58. text,
  59. False,
  60. None,
  61. None,
  62. ))
  63. inserted += 1
  64. conn.commit()
  65. cur.close()
  66. conn.close()
  67. print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")
  68. def process_all_docs():
  69. conn = get_db_conn()
  70. cur = conn.cursor()
  71. # Võtame kõik raw dokumendid
  72. cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
  73. docs = cur.fetchall()
  74. cur.close()
  75. conn.close()
  76. if not docs:
  77. print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
  78. return
  79. for raw_doc_id, filename in docs:
  80. print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
  81. process_single_doc(raw_doc_id, filename)
  82. print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")
  83. if __name__ == "__main__":
  84. process_all_docs()