瀏覽代碼

Kõik artiklid

Ardo Kubjas 3 月之前
父節點
當前提交
a5a1a6acf9
共有 1 個文件被更改,包括 49 次插入和 26 次删除
  1. 49 26
      fetch_articles/fetch_and_convert.py

+ 49 - 26
fetch_articles/fetch_and_convert.py

@@ -15,36 +15,59 @@ QUERY = {
 }
 
 def fetch_articles():
-    """Toob artiklid Weaviate GraphQL API-st"""
+    """Toob artiklid Weaviate GraphQL API-st (KÕIK limit + offset-iga)"""
     print("📡 Toon artikleid GraphQL API-st...")
     
-    try:
-        result = subprocess.run(
-            [
-                'curl',
-                '-s',
-                'http://100.80.222.54:9020/v1/graphql',
-                '-X', 'POST',
-                '-H', 'Content-Type: application/json',
-                '-d', json.dumps(QUERY)
-            ],
-            capture_output=True,
-            text=True,
-            timeout=30
-        )
+    all_articles = []
+    limit = 500
+    offset = 0
+    
+    while True:
+        query_str = f"""{{
+          Get {{
+            ScientificArticle(limit: {limit}, offset: {offset}) {{
+              title
+              source_file
+              summary_et
+              transport_context
+            }}
+          }}
+        }}"""
         
-        if result.returncode != 0:
-            print(f"❌ CURL viga: {result.stderr}")
-            return None
-            
-        data = json.loads(result.stdout)
-        articles = data.get('data', {}).get('Get', {}).get('ScientificArticle', [])
-        print(f"✅ Leidsin {len(articles)} artiklit")
-        return articles
+        query = {"query": query_str}
         
-    except Exception as e:
-        print(f"❌ Viga: {e}")
-        return None
+        try:
+            result = subprocess.run(
+                ['curl', '-s', 'http://100.80.222.54:9020/v1/graphql',
+                 '-X', 'POST',
+                 '-H', 'Content-Type: application/json',
+                 '-d', json.dumps(query)],
+                capture_output=True,
+                text=True,
+                timeout=60
+            )
+            
+            if result.returncode != 0:
+                print(f"❌ CURL viga: {result.stderr}")
+                break
+            
+            data = json.loads(result.stdout)
+            articles = data.get('data', {}).get('Get', {}).get('ScientificArticle', [])
+            
+            if not articles:
+                break
+            
+            all_articles.extend(articles)
+            print(f"  ✅ Tõin {len(articles)} artiklit (kokku: {len(all_articles)})")
+            
+            offset += limit
+            
+        except Exception as e:
+            print(f"❌ Viga: {e}")
+            break
+    
+    print(f"✅ LEIDSIN KOKKU {len(all_articles)} ARTIKLIT!")
+    return all_articles
 
 def extract_transport_context(transport_context):
     """Eraldab transport_context JSON-i võtmväljad loetaval kujul"""