# file: src/clean_and_normalize.py
"""Clean extracted PDF page text and store it in ``processed_documents``.

For every row of ``raw_documents`` the script loads
``<EXTRACTED_OUTPUT_DIR>/<pdf stem>/pages.json``, normalizes the whitespace of
each page's text and inserts one ``processed_documents`` row per non-empty
page.
"""
import json
import os
import re
from contextlib import closing
from pathlib import Path

import psycopg2
from dotenv import load_dotenv

# Must run before the os.getenv() calls below so values from .env are visible.
load_dotenv()

EXTRACTED_OUTPUT_DIR = Path(os.getenv("EXTRACTED_OUTPUT_DIR", "data/extracted"))

DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")
DB_NAME = os.getenv("DB_NAME", "pdf_research")
DB_USER = os.getenv("DB_USER", "pdf_user")
DB_PASSWORD = os.getenv("DB_PASSWORD")

# Compiled once at import time; used by clean_text_basic().
_HORIZONTAL_WS_RE = re.compile(r"[ \t]+")
_EXCESS_NEWLINES_RE = re.compile(r"\n{3,}")

_INSERT_PAGE_SQL = """
    INSERT INTO processed_documents
        (raw_doc_id, page, section, content_text, has_table, table_data, bbox)
    VALUES (%s, %s, %s, %s, %s, %s, %s)
"""


def get_db_conn():
    """Open a new psycopg2 connection using the DB_* settings above.

    The caller owns the returned connection and is responsible for closing it.
    """
    return psycopg2.connect(
        host=DB_HOST,
        port=DB_PORT,
        database=DB_NAME,
        user=DB_USER,
        password=DB_PASSWORD,
    )


def clean_text_basic(text: str) -> str:
    """Return *text* with line endings and whitespace normalized.

    Very simple cleanup:

    * ``\\r\\n`` and lone ``\\r`` become ``\\n``;
    * non-breaking spaces become regular spaces;
    * runs of spaces/tabs collapse to a single space;
    * three or more consecutive newlines collapse to two;
    * leading and trailing whitespace is stripped.

    A falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    # Handle CRLF before lone CR; replacing "\r" alone would turn every
    # "\r\n" into "\n\n" and insert a spurious blank line per line break.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = text.replace("\u00a0", " ")  # non-breaking space
    text = _HORIZONTAL_WS_RE.sub(" ", text)
    text = _EXCESS_NEWLINES_RE.sub("\n\n", text)
    return text.strip()


def process_single_doc(raw_doc_id: int, filename: str) -> None:
    """Insert the cleaned pages of one document into ``processed_documents``.

    Args:
        raw_doc_id: Value stored in the ``raw_doc_id`` column of each row.
        filename: PDF file name; its stem selects the directory under
            ``EXTRACTED_OUTPUT_DIR`` that holds ``pages.json``.

    If ``pages.json`` does not exist a message is printed and nothing is
    inserted. Pages whose cleaned text is empty are skipped. All inserts for
    the document are committed together; if any of them raises, the
    transaction is rolled back and the exception propagates.
    """
    pdf_stem = Path(filename).stem
    json_path = EXTRACTED_OUTPUT_DIR / pdf_stem / "pages.json"

    if not json_path.exists():
        print(f"❌ pages.json puudub: {json_path}")
        return

    # Read explicitly as UTF-8 instead of relying on the locale default.
    pages_data = json.loads(json_path.read_text(encoding="utf-8"))

    inserted = 0
    # closing() guarantees the connection is released even on error;
    # psycopg2's `with conn` only scopes the transaction (commit on success,
    # rollback on exception) and does not close the connection.
    with closing(get_db_conn()) as conn:
        with conn, conn.cursor() as cur:
            for page in pages_data:
                page_num = page["page"]
                text = clean_text_basic(page.get("text") or "")

                if not text:
                    print(f" ⚠️ Leht {page_num}: tühi tekst, jätan vahele")
                    continue

                cur.execute(
                    _INSERT_PAGE_SQL,
                    (
                        raw_doc_id,
                        page_num,
                        f"page_{page_num}",
                        text,
                        False,
                        None,
                        None,
                    ),
                )
                inserted += 1

    print(f"✓ {filename}: lisatud {inserted} rida processed_documents tabelisse")


def process_all_docs() -> None:
    """Run process_single_doc() for every row of ``raw_documents``.

    Rows are processed in ascending ``id`` order. If the table is empty a
    message is printed and nothing else happens.
    """
    # Fetch all raw documents up front and release the connection before the
    # per-document work, which opens its own connections.
    with closing(get_db_conn()) as conn:
        with conn.cursor() as cur:
            cur.execute("SELECT id, filename FROM raw_documents ORDER BY id")
            docs = cur.fetchall()

    if not docs:
        print("❌ raw_documents on tühi, esmalt käivita extract_pdf.py")
        return

    for raw_doc_id, filename in docs:
        print(f"🔧 Töötlen: {filename} (raw_doc_id={raw_doc_id})")
        process_single_doc(raw_doc_id, filename)

    print("✓ Kõik dokumendid töödeldud (lehe‑tasemel tekstina).")


if __name__ == "__main__":
    process_all_docs()