|
|
@@ -0,0 +1,405 @@
|
|
|
+#!/usr/bin/env python3
|
|
|
+"""
|
|
|
+Päringumootor Weaviate'i baasile DeepSeeki API abil
|
|
|
+"""
|
|
|
+
|
|
|
+import json
|
|
|
+import logging
|
|
|
+from typing import List, Dict, Any, Optional
|
|
|
+from tenacity import retry, stop_after_attempt, wait_exponential
|
|
|
+from openai import OpenAI
|
|
|
+
|
|
|
+from .weaviate_client import WeaviateClient
|
|
|
+from .config import config
|
|
|
+from .embedding_generator import EmbeddingGenerator
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
class QueryEngine:
    """Question-answering engine that pairs Weaviate retrieval with DeepSeek.

    Articles are fetched from the ``ScientificArticle`` collection (vector
    search, BM25, or a merge of both) and passed to the DeepSeek chat API as
    context for generating an answer.
    """

    # Name of the Weaviate collection every search runs against.
    COLLECTION_NAME = "ScientificArticle"
    # Score used when the backend returns no usable score or distance.
    DEFAULT_SCORE = 0.5
    # Maximum length of the short ``summary`` field before it is truncated.
    SUMMARY_PREVIEW_CHARS = 500

    def __init__(self):
        """Create the Weaviate client, the DeepSeek client and the embedder."""
        self.weaviate_client = WeaviateClient()
        # DeepSeek is reached through the OpenAI SDK pointed at the base URL
        # taken from the project configuration.
        self.deepseek_client = OpenAI(
            api_key=config.deepseek_api_key,
            base_url=config.deepseek_base_url
        )
        self.embedding_generator = EmbeddingGenerator()
        self.logger = logging.getLogger(__name__)

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    @classmethod
    def _build_article_record(cls, obj) -> Dict[str, Any]:
        """Convert one Weaviate result object into an article dict.

        Scores are not set here; each search method adds its own.
        Properties that are missing or ``None`` fall back to an empty
        default so later string formatting never receives ``None``.
        """
        props = obj.properties or {}

        def prop(name, default):
            value = props.get(name)
            return default if value is None else value

        # The id may be a UUID object; callers expect a plain string.
        article_id = props.get('article_id')
        if article_id:
            article_id = str(article_id)

        full_summary = prop('summary_et', '')
        # Only add an ellipsis when text was actually cut off.
        if len(full_summary) > cls.SUMMARY_PREVIEW_CHARS:
            summary = full_summary[:cls.SUMMARY_PREVIEW_CHARS] + '...'
        else:
            summary = full_summary

        return {
            'article_id': article_id,
            'title': prop('title', ''),
            'authors': prop('authors', []),
            'year': prop('year', ''),
            'journal': prop('journal', ''),
            'summary': summary,
            'full_summary': full_summary,
            'key_concepts': prop('key_concepts', []),
            'methods_used': prop('methods_used', []),
            'transport_context': prop('transport_context', ''),
            'relevance_score': prop('relevance_score', 5),
        }

    @staticmethod
    def _as_float(value, default: float) -> float:
        """Return ``value`` as a float, or ``default`` if it is not numeric."""
        try:
            return float(value)
        except (TypeError, ValueError):
            return default

    @staticmethod
    def _join_values(values) -> str:
        """Join a list-like value with ``', '``; tolerate ``None`` and plain strings."""
        if not values:
            return ''
        if isinstance(values, str):
            # A bare string must not be split into characters by join().
            return values
        return ', '.join(str(item) for item in values)

    @staticmethod
    def _text_or_na(value) -> str:
        """Return ``value`` as text, or ``'N/A'`` when it is ``None`` or empty."""
        if value is None or value == '':
            return 'N/A'
        return str(value)

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------
    def search_articles_semantic(self, query: str, limit: int = 5) -> List[Dict]:
        """Search articles by vector similarity.

        Args:
            query: Free-text query; it is embedded with the embedding generator.
            limit: Maximum number of articles to return.

        Returns:
            Article dicts sorted by ``score`` (best first). Each dict also
            carries the raw ``distance``. Any error is logged and an empty
            list is returned.
        """
        try:
            query_embedding = self.embedding_generator.generate_embedding(query)
            collection = self.weaviate_client.client.collections.get(self.COLLECTION_NAME)

            response = collection.query.near_vector(
                near_vector=query_embedding,
                limit=limit,
                return_metadata=["distance", "score"]
            )

            results = []
            for obj in response.objects:
                distance = getattr(obj.metadata, 'distance', None)
                # Smaller distance means a better match; map it to a score
                # that decreases as distance grows.
                if distance is not None:
                    score = 1.0 / (1.0 + distance)
                else:
                    score = self.DEFAULT_SCORE

                record = self._build_article_record(obj)
                record['distance'] = distance
                record['score'] = score
                results.append(record)

            results.sort(key=lambda item: item['score'], reverse=True)

            self.logger.info(f"Leidsin {len(results)} artiklit semantilise otsinguga")
            return results

        except Exception as e:
            self.logger.error(f"Viga semantilisel otsingul: {str(e)}")
            return []

    def search_articles_keyword(self, query: str, limit: int = 5) -> List[Dict]:
        """Search articles by keywords (BM25).

        Args:
            query: Keyword query.
            limit: Maximum number of articles to return.

        Returns:
            Article dicts in the order returned by Weaviate, each with a
            numeric ``score``. Any error is logged and an empty list is
            returned.
        """
        try:
            collection = self.weaviate_client.client.collections.get(self.COLLECTION_NAME)

            response = collection.query.bm25(
                query=query,
                query_properties=["title", "summary_et", "abstract_en", "key_concepts"],
                limit=limit,
                # The score has to be requested explicitly; otherwise the
                # metadata field is present but None.
                return_metadata=["score"]
            )

            results = []
            for obj in response.objects:
                score = getattr(obj.metadata, 'score', None)
                if score is None:
                    score = self.DEFAULT_SCORE

                record = self._build_article_record(obj)
                record['score'] = score
                results.append(record)

            self.logger.info(f"Leidsin {len(results)} artiklit võtmesõnade otsinguga")
            return results

        except Exception as e:
            self.logger.error(f"Viga võtmesõnade otsingul: {str(e)}")
            return []

    def search_articles_hybrid(self, query: str, limit: int = 5) -> List[Dict]:
        """Merge the semantic and keyword searches into one ranked list.

        Both searches are run with ``limit * 2``. Results are de-duplicated
        by ``article_id`` (the semantic hit wins), sorted by ``score`` and
        cut to ``limit``.

        Args:
            query: User query.
            limit: Maximum number of articles to return.

        Returns:
            Up to ``limit`` article dicts, best score first.
        """
        # NOTE(review): semantic scores are 1/(1+distance) while keyword
        # scores are raw BM25 values; the two scales are sorted together
        # here without normalisation - confirm this ranking is intended.
        try:
            semantic_results = self.search_articles_semantic(query, limit * 2)
            keyword_results = self.search_articles_keyword(query, limit * 2)

            merged = []
            seen_ids = set()
            for result in semantic_results + keyword_results:
                article_id = result.get('article_id')
                # Only de-duplicate on real ids: results without an id are
                # distinct articles and must not collapse into one.
                if article_id is not None:
                    if article_id in seen_ids:
                        continue
                    seen_ids.add(article_id)
                if result.get('score') is None:
                    result['score'] = self.DEFAULT_SCORE
                merged.append(result)

            merged.sort(
                key=lambda item: self._as_float(item['score'], 0.0),
                reverse=True,
            )
            return merged[:limit]

        except Exception as e:
            self.logger.error(f"Viga hübriidotsingul: {str(e)}")
            # Fall back to the single-mode searches, moving on when one
            # fails or finds nothing.
            for fallback in (self.search_articles_semantic, self.search_articles_keyword):
                try:
                    results = fallback(query, limit)
                except Exception:
                    continue
                if results:
                    return results
            return []

    # ------------------------------------------------------------------
    # Generation
    # ------------------------------------------------------------------
    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=4, max=10)
    )
    def query_deepseek(self, messages: List[Dict], temperature: float = 0.7, max_tokens: int = 4000) -> str:
        """Send a chat-completion request to the DeepSeek API.

        The call is retried by tenacity: at most 3 attempts with an
        exponential wait bounded between 4 and 10 seconds.

        Args:
            messages: Chat messages (dicts with ``role`` and ``content``).
            temperature: Sampling temperature.
            max_tokens: Maximum number of tokens to generate.

        Returns:
            The text of the first choice; an empty string if the API
            returned no content.

        Raises:
            Exception: Every failed attempt is logged and re-raised so that
                tenacity can retry. ``reraise=True`` is not set, so once the
                attempts are exhausted tenacity raises its own ``RetryError``.
        """
        try:
            response = self.deepseek_client.chat.completions.create(
                model=config.deepseek_model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )
            content = response.choices[0].message.content
            # Honour the declared ``str`` return type when content is None.
            return content if content is not None else ""

        except Exception as e:
            self.logger.error(f"DeepSeek API viga: {str(e)}")
            raise

    def generate_answer(self, query: str, articles: List[Dict], context_type: str = "detailed") -> Dict:
        """Generate an answer with DeepSeek from the retrieved articles.

        Args:
            query: User question.
            articles: Article dicts as produced by the search methods.
            context_type: Context style: 'detailed', 'concise' or 'technical'.

        Returns:
            Dict with ``answer``, ``sources`` and ``confidence``. When
            articles were supplied it also has ``context_used`` and
            ``query_type``. If the API call fails, the answer text carries
            the error message and confidence is 0.3.
        """
        if not articles:
            return {
                "answer": "Kahjuks ei leidnud antud küsimusele vastavaid artikleid.",
                "sources": [],
                "confidence": 0.0
            }

        context = self._format_articles_context(articles, context_type)
        messages = [
            {"role": "system", "content": self._get_system_prompt(context_type)},
            {"role": "user", "content": self._get_user_prompt(query, context, articles)}
        ]

        try:
            answer = self.query_deepseek(messages, temperature=0.7)
            return {
                "answer": answer,
                "sources": articles,
                "confidence": self._calculate_confidence(articles),
                "context_used": len(articles),
                "query_type": context_type
            }

        except Exception as e:
            self.logger.error(f"Viga vastuse genereerimisel: {str(e)}")
            return {
                "answer": f"Vastuse genereerimisel tekkis viga: {str(e)}",
                "sources": articles,
                "confidence": 0.3,
                "context_used": len(articles),
                "query_type": context_type
            }

    def _format_articles_context(self, articles: List[Dict], context_type: str) -> str:
        """Render the articles as the text context placed in the user prompt.

        Args:
            articles: Article dicts; missing or ``None`` fields are tolerated.
            context_type: 'detailed' or 'concise'; any other value uses the
                'technical' layout.

        Returns:
            One text block per article, joined with newlines.
        """
        context_parts = []

        for i, article in enumerate(articles, 1):
            title = self._text_or_na(article.get('title'))
            year = self._text_or_na(article.get('year'))
            summary = self._text_or_na(article.get('summary'))

            if context_type == "detailed":
                relevance = article.get('relevance_score')
                if relevance is None:
                    relevance = 5
                lines = [
                    "",
                    f"ARTIKKEL {i}:",
                    f"Pealkiri: {title}",
                    f"Autorid: {self._join_values(article.get('authors'))}",
                    f"Aasta: {year}",
                    f"Žurnaal: {self._text_or_na(article.get('journal'))}",
                    f"Kokkuvõte: {article.get('full_summary') or summary}",
                    f"Võtmesõnad: {self._join_values(article.get('key_concepts'))}",
                    f"Meetodid: {self._join_values(article.get('methods_used'))}",
                    f"Relevantsus: {relevance}/10",
                    "",
                ]
            elif context_type == "concise":
                lines = [
                    "",
                    f"[{i}] {title} ({year})",
                    f"Kokkuvõte: {summary}",
                    "",
                ]
            else:  # technical
                # Keep the excerpt short; mark it only when it was cut.
                excerpt = summary[:300] + '...' if len(summary) > 300 else summary
                lines = [
                    "",
                    f"[{i}] {title}",
                    f"Aasta: {year}, Autorid: {self._join_values(article.get('authors'))}",
                    f"Meetodid: {self._join_values(article.get('methods_used'))}",
                    f"Põhitulemused: {excerpt}",
                    "",
                ]

            context_parts.append("\n".join(lines))

        return "\n".join(context_parts)

    def _get_system_prompt(self, context_type: str) -> str:
        """Return the system prompt for the given context type.

        'technical' and 'concise' have dedicated prompts; every other value
        gets the 'detailed' prompt.
        """
        if context_type == "technical":
            return (
                "Sa oled transpordiplaneerimise ekspert ja teadusartiklite analüütik.\n"
                "Vasta kasutaja küsimusele põhjalikult ja teaduslikult, viidates allikatele.\n"
                "Kasuta selget, asjalikku keelt ja too välja olulisemad teaduslikud leidud.\n"
                "Vasta Eesti keeles, vajadusel kasuta inglise keelseid termineid."
            )

        if context_type == "concise":
            return (
                "Sa oled transpordivaldkonna konsultant.\n"
                "Vasta kasutaja küsimusele lühidalt ja asjakohaselt.\n"
                "Too välja peamised punktid ja praktilised rakendused.\n"
                "Vasta Eesti keeles, ole selge ja arusaadav."
            )

        # detailed
        return (
            "Sa oled transpordiuuringute spetsialist.\n"
            "Vasta kasutaja küsimusele põhjalikult, struktureeritult ja analüütiliselt.\n"
            "Too välja nii teoreetilised kui praktilised aspektid, viidates konkreetsetele artiklitele.\n"
            "Vasta Eesti keeles, kasuta selget ja täpset keelt.\n"
            "Struktureeri vastus järgmiselt:\n"
            "1. Ülevaade leitud informatsioonist\n"
            "2. Peamised leidud ja tulemused\n"
            "3. Praktilised rakendused\n"
            "4. Piirangud ja edasised suunad"
        )

    def _get_user_prompt(self, query: str, context: str, articles: List[Dict]) -> str:
        """Build the user prompt: the question, the article context and citation instructions."""
        return (
            f"Kasutaja küsimus: {query}\n"
            "\n"
            f"Allpool on toodud {len(articles)} teadusartiklit, mis võivad küsimusele vastamisel abiks olla:\n"
            "\n"
            f"{context}\n"
            "\n"
            "Palun vasta küsimusele põhjalikult, viidates artiklitele numbritega [1], [2], jne.\n"
            "Kui mingi osa küsimusest ei ole artiklitest kajastatud, märgi see eraldi.\n"
            "Vasta Eesti keeles."
        )

    def _calculate_confidence(self, articles: List[Dict]) -> float:
        """Compute a confidence value in [0, 1] for a set of articles.

        The value is ``0.6 * mean(score) + 0.4 * mean(relevance_score) / 10``,
        clamped to [0, 1]. A missing or non-numeric ``score`` counts as 0
        and a missing or non-numeric ``relevance_score`` counts as 5.

        Returns:
            0.0 for an empty list, otherwise the clamped weighted value.
        """
        if not articles:
            return 0.0

        count = len(articles)
        avg_score = sum(
            self._as_float(article.get('score'), 0.0) for article in articles
        ) / count
        avg_relevance = sum(
            self._as_float(article.get('relevance_score'), 5.0) for article in articles
        ) / count / 10

        confidence = (avg_score * 0.6) + (avg_relevance * 0.4)
        return max(0.0, min(1.0, confidence))

    # ------------------------------------------------------------------
    # Entry point
    # ------------------------------------------------------------------
    def ask(self, query: str, search_type: str = "hybrid", limit: int = 5, context_type: str = "detailed") -> Dict:
        """Run a search and generate an answer - the main entry point.

        Args:
            query: User question.
            search_type: 'semantic' or 'keyword'; any other value runs the
                hybrid search.
            limit: Number of articles to retrieve.
            context_type: Context style: 'detailed', 'concise' or 'technical'.

        Returns:
            The dict from ``generate_answer`` extended with ``query``,
            ``search_type``, ``articles_found`` and ``timestamp``.
        """
        self.logger.info(f"Päring: '{query}' (tüüp: {search_type}, piirang: {limit})")

        searchers = {
            "semantic": self.search_articles_semantic,
            "keyword": self.search_articles_keyword,
        }
        search = searchers.get(search_type, self.search_articles_hybrid)
        articles = search(query, limit)

        result = self.generate_answer(query, articles, context_type)
        result.update({
            "query": query,
            "search_type": search_type,
            "articles_found": len(articles),
            "timestamp": self._get_timestamp()
        })

        return result

    def _get_timestamp(self) -> str:
        """Return the current local time formatted as ``YYYY-MM-DD HH:MM:SS``."""
        from datetime import datetime
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def close(self):
        """Release resources by closing the Weaviate client (best effort).

        Failures are logged rather than raised so that cleanup never masks
        the error that caused the shutdown.
        """
        try:
            self.weaviate_client.close()
        except Exception as e:
            self.logger.warning(f"Viga Weaviate'i ühenduse sulgemisel: {str(e)}")
|
|
|
+
|
|
|
+# CLI skripti jaoks
|
|
|
def create_query_engine() -> "QueryEngine":
    """Build and return a new QueryEngine instance (factory used by CLI scripts)."""
    engine = QueryEngine()
    return engine
|