utils.py 2.1 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. import os
  2. import json
  3. import logging
  4. from datetime import datetime
  5. from typing import Dict, Any, List
  6. from .config import config
  7. def setup_logging():
  8. """Seadista logimine"""
  9. log_file = os.path.join(config.log_dir, f"processing_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log")
  10. logging.basicConfig(
  11. level=logging.INFO,
  12. format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
  13. handlers=[
  14. logging.FileHandler(log_file, encoding='utf-8'),
  15. logging.StreamHandler()
  16. ]
  17. )
  18. return logging.getLogger(__name__)
  19. def save_processed_article(data: Dict[str, Any]):
  20. """Salvesta töödeldud artikkel JSON failina"""
  21. try:
  22. # Genereeri failinimi
  23. timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
  24. filename = f"article_{timestamp}.json"
  25. filepath = os.path.join(config.processed_dir, filename)
  26. # Salvesta JSON failina
  27. with open(filepath, 'w', encoding='utf-8') as f:
  28. json.dump(data, f, ensure_ascii=False, indent=2)
  29. logger = logging.getLogger(__name__)
  30. logger.info(f"Töödeldud andmed salvestatud: {filepath}")
  31. except Exception as e:
  32. logger = logging.getLogger(__name__)
  33. logger.error(f"Viga andmete salvestamisel: {str(e)}")
  34. def load_processed_articles() -> List[Dict]:
  35. """Lae töödeldud artiklid"""
  36. articles = []
  37. for filename in os.listdir(config.processed_dir):
  38. if filename.endswith('.json'):
  39. filepath = os.path.join(config.processed_dir, filename)
  40. try:
  41. with open(filepath, 'r', encoding='utf-8') as f:
  42. articles.append(json.load(f))
  43. except:
  44. continue
  45. return articles
  46. def clean_filename(filename: str) -> str:
  47. """Puhasta failinimi erimärkidest"""
  48. import re
  49. # Eemalda erimärgid, jäta ainult tähed, numbrid, tühikud, punktid ja sidekriipsud
  50. clean = re.sub(r'[^\w\s\.\-]', '', filename)
  51. # Asenda mitmik tühikud ühega
  52. clean = re.sub(r'\s+', ' ', clean)
  53. return clean.strip()