weaviate_client.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338
  1. import weaviate
  2. from weaviate.classes.config import DataType, Property, Configure, VectorDistances
  3. from weaviate.classes.query import Filter
  4. import hashlib
  5. import json
  6. from typing import Dict, List, Optional, Any
  7. import logging
  8. from urllib.parse import urlparse
  9. from .config import config
  10. logger = logging.getLogger(__name__)
  11. class WeaviateClient:
  12. """Weaviate kliendi klass versioon 4.19.0"""
  def __init__(self):
      """Connect to Weaviate and make sure the article collection exists.

      Sets ``logger``, ``class_name`` and ``client`` on the instance and then
      runs ``_setup_schema()``.
      """
      self.logger = logging.getLogger(__name__)
      # The collection name does not depend on the connection, so it is
      # fixed before the (possibly slow) connection attempt.
      self.class_name = "ScientificArticle"
      self.client = self._connect_to_weaviate()
      self._setup_schema()
  def _connect_to_weaviate(self):
      """Connect to Weaviate using explicit connection parameters.

      Reads ``config.weaviate_url`` and ``config.weaviate_api_key``. The URL
      is split into host, port and TLS flag; gRPC is assumed to listen on
      the same host at port 50051.

      Returns:
          A connected ``weaviate.WeaviateClient``. If anything fails here the
          error is logged and the result of ``_connect_fallback()`` is
          returned instead.

      Raises:
          ConnectionError: Propagated from ``_connect_fallback()`` when the
              fallback fails as well.
      """
      client = None
      try:
          url = config.weaviate_url
          self.logger.info(f"Ühendan Weaviate: {url}")

          # urlparse("host:8080") reports scheme "host", so a missing
          # protocol is detected via the "://" separator rather than via
          # parsed.scheme.
          if "://" not in url:
              url = f"http://{url}"
          parsed_url = urlparse(url)

          host = parsed_url.hostname
          secure = parsed_url.scheme == 'https'
          port = parsed_url.port or (443 if secure else 80)
          self.logger.info(f"Parsitud: host={host}, port={port}, secure={secure}")

          # Credentials belong to the client, not to ConnectionParams:
          # ConnectionParams.from_params() has no auth parameter.
          auth_credentials = None
          if config.weaviate_api_key:
              auth_credentials = weaviate.auth.AuthApiKey(config.weaviate_api_key)

          client = weaviate.WeaviateClient(
              connection_params=weaviate.connect.ConnectionParams.from_params(
                  http_host=host,
                  http_port=port,
                  http_secure=secure,
                  grpc_host=host,
                  grpc_port=50051,
                  grpc_secure=secure,
              ),
              auth_client_secret=auth_credentials,
          )
          client.connect()
          self.logger.info(f"Ühendatud Weaviate'iga: {host}:{port}")
          return client
      except Exception as e:
          # Release whatever the half-open client holds before retrying.
          if client is not None:
              try:
                  client.close()
              except Exception:
                  pass
          self.logger.error(f"Viga Weaviate'iga ühendumisel: {str(e)}")
          self.logger.info("Proovin alternatiivset ühendusviisi...")
          return self._connect_fallback()
  def _connect_fallback(self):
      """Second connection attempt using the ``weaviate.connect_to_*`` helpers.

      Chooses the helper from ``config.weaviate_url``:

      * host ``localhost`` / ``127.0.0.1`` -> ``connect_to_local`` (port
        defaults to 8080),
      * any other host with an API key -> ``connect_to_weaviate_cloud``,
      * any other host without a key -> ``connect_to_custom`` (gRPC assumed
        on port 50051 of the same host).

      Returns:
          The client returned by the chosen helper.

      Raises:
          ConnectionError: If the attempt fails; the original exception is
              attached as ``__cause__``.
      """
      try:
          url = config.weaviate_url
          self.logger.info(f"Alternatiivne ühendus: {url}")

          if not url.startswith(("http://", "https://")):
              url = f"http://{url}"
          parsed = urlparse(url)
          host = parsed.hostname
          secure = parsed.scheme == "https"

          auth_credentials = None
          if config.weaviate_api_key:
              auth_credentials = weaviate.auth.AuthApiKey(config.weaviate_api_key)

          # Compare the parsed hostname instead of searching the whole URL,
          # so e.g. "notlocalhost.example.com" is not treated as local.
          if host in ("localhost", "127.0.0.1"):
              client = weaviate.connect_to_local(
                  host=host,
                  port=parsed.port or 8080,
                  auth_credentials=auth_credentials,
              )
          elif auth_credentials is not None:
              client = weaviate.connect_to_weaviate_cloud(
                  cluster_url=url,
                  auth_credentials=auth_credentials,
              )
          else:
              client = weaviate.connect_to_custom(
                  http_host=host,
                  http_port=parsed.port or (443 if secure else 80),
                  http_secure=secure,
                  grpc_host=host,
                  grpc_port=50051,
                  grpc_secure=secure,
              )

          self.logger.info(f"Alternatiivne ühendus õnnestus: {url}")
          return client
      except Exception as e:
          self.logger.error(f"Mõlemad ühendusviisid ebaõnnestusid: {str(e)}")
          raise ConnectionError(f"Ei saanud Weaviate'iga ühendust: {str(e)}") from e
  def _setup_schema(self):
      """Create the article collection unless it already exists.

      The collection is created without a server-side vectorizer because
      embeddings are supplied by the caller at insert time.

      This step is best-effort: any failure is logged with a traceback and
      swallowed, so constructing the client never fails because of it.
      """
      try:
          if self.client.collections.exists(self.class_name):
              self.logger.info(f"Klass {self.class_name} on juba olemas")
              return

          # (property name, data type, description)
          field_specs = [
              ("article_id", DataType.TEXT, "Artikli unikaalne ID"),
              ("title", DataType.TEXT, "Artikli pealkiri"),
              ("authors", DataType.TEXT_ARRAY, "Artikli autorid"),
              ("year", DataType.INT, "Avaldamisaasta"),
              ("journal", DataType.TEXT, "Žurnaal"),
              ("doi", DataType.TEXT, "DOI identifikaator"),
              ("abstract_en", DataType.TEXT, "Inglise keelne abstrakt"),
              ("summary_et", DataType.TEXT, "Eesti keelne kokkuvõte"),
              ("key_concepts", DataType.TEXT_ARRAY, "Võtmesõnad ja mõisted"),
              ("methods_used", DataType.TEXT_ARRAY, "Kasutatud meetodid"),
              ("transport_context", DataType.TEXT, "Transpordi konteksti analüüs"),
              ("relevance_score", DataType.INT, "Relevantsus skoor 1-10"),
              ("processing_date", DataType.TEXT, "Töötlemise kuupäev"),
              ("source_file", DataType.TEXT, "Algne PDF fail"),
              ("file_hash", DataType.TEXT, "Faili hash duplikaatide kontrolliks"),
          ]

          self.client.collections.create(
              name=self.class_name,
              # Vectors are self-provided; `Configure.Vector.none()` is not
              # part of the client API and raised AttributeError here.
              vector_config=Configure.Vectors.self_provided(),
              properties=[
                  Property(name=name, data_type=data_type, description=description)
                  for name, data_type, description in field_specs
              ],
          )
          self.logger.info(f"Loodi klass: {self.class_name}")
      except Exception as e:
          # Deliberately not re-raised (e.g. the collection may have been
          # created concurrently), but keep the traceback so real schema
          # errors are visible in the log.
          self.logger.error(f"Viga skeemi seadistamisel: {str(e)}", exc_info=True)
  def generate_article_id(self, article_data: Dict) -> str:
      """Derive a stable ID for an article.

      The ID is the MD5 hex digest of the first available identifier, in
      this order: ``doi``, ``file_hash``, then ``"<title>_<up to three
      authors joined by '_'>"``. MD5 is used only as a fingerprint, not for
      security.

      Args:
          article_data: Article fields. ``authors`` may be a list, a single
              string (treated as one author) or missing/``None``.

      Returns:
          A 32-character lowercase hex string.
      """
      doi = article_data.get('doi')
      file_hash = article_data.get('file_hash')

      if doi:
          unique_string = str(doi)
      elif file_hash:
          unique_string = str(file_hash)
      else:
          title = article_data.get('title', '')
          # `authors` may be present but None; a bare string must count as
          # one author instead of being sliced into its first characters.
          authors = article_data.get('authors') or []
          if isinstance(authors, str):
              authors = [authors]
          author_part = '_'.join(str(author) for author in authors[:3])
          unique_string = f"{title}_{author_part}"

      return hashlib.md5(unique_string.encode()).hexdigest()
  def article_exists(self, article_id: str) -> bool:
      """Report whether an object with this ``article_id`` is stored.

      Args:
          article_id: Value to match against the ``article_id`` property.

      Returns:
          True if at least one matching object is found. False if none is
          found or if the lookup fails; failures are logged, not raised.
      """
      try:
          articles = self.client.collections.get(self.class_name)
          id_filter = Filter.by_property("article_id").equal(article_id)
          # One hit is enough to answer the question.
          found = articles.query.fetch_objects(filters=id_filter, limit=1)
          return len(found.objects) >= 1
      except Exception as e:
          self.logger.error(f"Viga artikli olemasolu kontrollimisel: {str(e)}")
          return False
  223. def save_article(self, article_data: Dict, embeddings: Dict) -> bool:
  224. """Salvesta artikkel Weaviate'i"""
  225. try:
  226. # Genereeri artikkel ID
  227. article_id = self.generate_article_id(article_data)
  228. # Kontrolli duplikaati
  229. if self.article_exists(article_id):
  230. self.logger.info(f"Artikkel {article_id} on juba olemas, jätan vahele")
  231. return False
  232. # Valmistame andmed ette
  233. properties = {
  234. "article_id": article_id,
  235. "title": article_data.get('title', ''),
  236. "authors": article_data.get('authors', []),
  237. "year": int(article_data.get('year', 0)) if str(article_data.get('year', '0')).isdigit() else 0,
  238. "journal": article_data.get('journal', ''),
  239. "doi": article_data.get('doi', ''),
  240. "abstract_en": article_data.get('abstract_en', ''),
  241. "summary_et": article_data.get('summary_et', ''),
  242. "key_concepts": article_data.get('key_concepts', []),
  243. "methods_used": article_data.get('methods_used', []),
  244. "transport_context": json.dumps(article_data.get('transport_context', {}), ensure_ascii=False),
  245. "relevance_score": article_data.get('relevance_score', 5),
  246. "processing_date": article_data.get('processing_date', ''),
  247. "source_file": article_data.get('source_file', ''),
  248. "file_hash": article_data.get('file_hash', '')
  249. }
  250. # Kasutame kokkuvõtte embeddingut vektorina
  251. vector = embeddings.get('summary', [])
  252. # Salvesta Weaviate'i
  253. collection = self.client.collections.get(self.class_name)
  254. # Lisa objekt koos vektoriga
  255. collection.data.insert(
  256. properties=properties,
  257. vector=vector
  258. )
  259. self.logger.info(f"Artikkel salvestatud: {article_id}")
  260. return True
  261. except Exception as e:
  262. self.logger.error(f"Viga artikli salvestamisel: {str(e)}")
  263. return False
  def search_articles(self, query: str, limit: int = 10) -> List[Dict]:
      """Keyword (BM25) search over the stored articles.

      The query is matched against ``title``, ``summary_et``,
      ``abstract_en`` and ``key_concepts``.

      Args:
          query: Search text.
          limit: Maximum number of hits to return.

      Returns:
          One dict per hit with the keys ``article_id``, ``title``,
          ``authors``, ``year``, ``summary`` (first 500 characters of
          ``summary_et``, followed by ``'...'`` only when it was actually
          cut), ``relevance_score`` and ``score`` (the BM25 score). An empty
          list is returned when the search fails; the error is logged.
      """
      # Imported locally so this method is self-contained; the package is
      # already a dependency of this module.
      from weaviate.classes.query import MetadataQuery

      max_summary_chars = 500
      try:
          collection = self.client.collections.get(self.class_name)
          response = collection.query.bm25(
              query=query,
              query_properties=["title", "summary_et", "abstract_en", "key_concepts"],
              limit=limit,
              # Metadata is opt-in: without this, obj.metadata.score is None.
              return_metadata=MetadataQuery(score=True),
          )

          results = []
          for obj in response.objects:
              props = obj.properties
              # A stored null must not abort the whole result list.
              summary = props.get('summary_et') or ''
              if len(summary) > max_summary_chars:
                  summary = summary[:max_summary_chars] + '...'
              results.append({
                  'article_id': props.get('article_id'),
                  'title': props.get('title'),
                  'authors': props.get('authors') or [],
                  'year': props.get('year'),
                  'summary': summary,
                  'relevance_score': props.get('relevance_score'),
                  'score': obj.metadata.score,
              })
          return results
      except Exception as e:
          self.logger.error(f"Viga otsingul: {str(e)}")
          return []
  290. def close(self):
  291. """Sulge Weaviate ühendus"""
  292. if hasattr(self, 'client') and self.client:
  293. try:
  294. self.client.close()
  295. except:
  296. pass