| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
- # file: src/clean_and_normalize.py
- import os
- import json
- from pathlib import Path
- from dotenv import load_dotenv
- import psycopg2
# Load variables from a .env file (if one is found) into the process
# environment before the settings below are read.
load_dotenv()

# Root directory of the extraction output; each document's pages.json is
# looked up in a sub-directory of this path.
EXTRACTED_OUTPUT_DIR = Path(os.environ.get("EXTRACTED_OUTPUT_DIR", "data/extracted"))

# PostgreSQL connection settings, each overridable through the environment.
DB_HOST = os.environ.get("DB_HOST", "localhost")
DB_PORT = os.environ.get("DB_PORT", "5432")
DB_NAME = os.environ.get("DB_NAME", "pdf_research")
DB_USER = os.environ.get("DB_USER", "pdf_user")
# No default for the password: it is None unless DB_PASSWORD is set.
DB_PASSWORD = os.environ.get("DB_PASSWORD")
def get_db_conn():
    """Open and return a new psycopg2 connection built from the DB_* settings.

    The caller owns the returned connection and is responsible for closing it.
    """
    connection_settings = {
        "host": DB_HOST,
        "port": DB_PORT,
        "database": DB_NAME,
        "user": DB_USER,
        "password": DB_PASSWORD,
    }
    return psycopg2.connect(**connection_settings)
def clean_text_basic(text: str) -> str:
    """Apply a minimal whitespace clean-up to *text*.

    Steps, in order:

    1. Normalize line endings: CRLF pairs and lone CR both become one LF.
    2. Replace non-breaking spaces (U+00A0) with ordinary spaces.
    3. Collapse every run of spaces/tabs into a single space.
    4. Collapse three or more consecutive newlines into exactly two.
    5. Strip leading and trailing whitespace.

    Args:
        text: Raw text. ``None`` and the empty string are accepted.

    Returns:
        The cleaned text, or ``""`` when *text* is falsy.
    """
    import re  # imported locally, as in the original block

    if not text:
        return ""

    # CRLF must be handled before lone CR: mapping every "\r" straight to
    # "\n" would turn each Windows line break into "\n\n" (a blank line).
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\u00a0", " ")  # non-breaking space
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
def process_single_doc(raw_doc_id: int, filename: str):
    """Store the cleaned page texts of one extracted document in the database.

    Reads ``EXTRACTED_OUTPUT_DIR/<stem of filename>/pages.json``, cleans each
    page's text with ``clean_text_basic`` and inserts one row per non-empty
    page into ``processed_documents``. All rows of the document are committed
    together; if any page or insert fails, the transaction is rolled back and
    the exception propagates. The connection is always closed.

    Args:
        raw_doc_id: Value written to the ``raw_doc_id`` column of every row.
        filename: Name of the source PDF; only its stem is used to locate the
            extraction directory.

    Returns:
        None. A missing ``pages.json`` is reported on stdout and the document
        is skipped.
    """
    pdf_stem = Path(filename).stem
    json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"
    if not json_path.exists():
        print(f"❌ pages.json puudub: {json_path}")
        return

    # Decode explicitly as UTF-8 instead of the platform default encoding,
    # which would break non-ASCII text on non-UTF-8 locales.
    # NOTE(review): assumes the extractor writes UTF-8 (the JSON standard
    # encoding) — confirm against extract_pdf.py.
    pages_data = json.loads(json_path.read_text(encoding="utf-8"))

    insert_sql = """
        INSERT INTO processed_documents
        (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
        VALUES
        (%s, %s, %s, %s, %s, %s, %s)
    """

    inserted = 0
    conn = get_db_conn()
    try:
        # `with conn` commits when the block succeeds and rolls back when it
        # raises; `with conn.cursor()` closes the cursor. Neither closes the
        # connection itself, hence the surrounding try/finally.
        with conn, conn.cursor() as cur:
            for page in pages_data:
                page_num = page["page"]
                raw_text = page.get("text") or ""
                text = clean_text_basic(raw_text)
                if not text:
                    print(f" ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
                    continue
                cur.execute(insert_sql, (
                    raw_doc_id,
                    page_num,
                    f"page_{page_num}",
                    text,
                    False,  # has_table: this step stores page text only
                    None,   # table_data
                    None,   # bbox
                ))
                inserted += 1
    finally:
        conn.close()

    print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")
def process_all_docs():
    """Run ``process_single_doc`` for every row of ``raw_documents``.

    Fetches all ``(id, filename)`` pairs ordered by id, closes that connection,
    and then processes the documents one by one. The connection used for the
    listing query is closed even if the query raises.

    Returns:
        None. An empty ``raw_documents`` table is reported on stdout and
        nothing is processed.
    """
    conn = get_db_conn()
    try:
        with conn.cursor() as cur:
            # Fetch every raw document.
            cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
            docs = cur.fetchall()
    finally:
        # Close before the per-document work starts; each document opens its
        # own connection.
        conn.close()

    if not docs:
        print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
        return

    for raw_doc_id, filename in docs:
        print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
        process_single_doc(raw_doc_id, filename)

    print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")
# Script entry point: process every raw document when the file is run directly.
if __name__ == "__main__":
    process_all_docs()
|