Commit 72a0730d authored by Ivan Tyagov

Use the proper script to get all Data Streams for a Data Set rather than relying on the reference. Test it.

parent eb51cb86
@@ -4,7 +4,7 @@
 Note: This code is quite computationally costly (for Data Streams having thousands of files) as it needs to:
 1. Query MariaDB to find ingestion lines
-2. Read from ZODB both Data Ingestion Lines and Data Streams (whoch itself can be big too)
+2. Read from ZODB both Data Ingestion Lines and Data Streams (which itself can be big too)
 """
 data_ingestion_line_list = context.portal_catalog(
   portal_type = "Data Ingestion Line",
...
"""
This script is called from ebulk client to get list of Data Streams for a
Data set.
"""
import re import re
import json import json
from Products.ERP5Type.Log import log from Products.ERP5Type.Log import log
...@@ -20,14 +25,8 @@ data_set = portal.data_set_module.get(data_set_reference) ...@@ -20,14 +25,8 @@ data_set = portal.data_set_module.get(data_set_reference)
if data_set is None: if data_set is None:
return [] return []
# XXX: use DataSet_getDataStreamList instead!
query_dict = {
"portal_type": "Data Stream",
"reference": data_set.getReference() + reference_separator + "%"}
data_stream_list = [] data_stream_list = []
for stream in data_set.DataSet_getDataStreamList():
for stream in portal_catalog(**query_dict):
if stream.getVersion() == "": if stream.getVersion() == "":
return { "status_code": 2, "result": [] } return { "status_code": 2, "result": [] }
data_stream_list.append({ 'id': 'data_stream_module/'+stream.getId(), data_stream_list.append({ 'id': 'data_stream_module/'+stream.getId(),
......
@@ -84,7 +84,6 @@ class TestDataIngestion(SecurityTestCase):
   def ingest(self, data_chunk, reference, extension, eof, randomize_ingestion_reference=False):
     ingestion_reference = self.getIngestionReference(reference, extension, randomize_ingestion_reference)
-    self.portal.log(ingestion_reference)
     # use default ebulk policy
     ingestion_policy = self.portal.portal_ingestion_policies.wendelin_embulk
@@ -197,6 +196,9 @@ class TestDataIngestion(SecurityTestCase):
     """
     data_set, data_stream_list = self.stepIngest(self.CSV, ",", randomize_ingestion_reference=True)
     self.tic()
+    # check the data relation between Data Set and Data Streams works
+    self.assertSameSet(data_stream_list, data_set.DataSet_getDataStreamList())
     # publish data set and have all Data Streams published automatically
     data_set.publish()
...
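
The note in the first hunk above describes the work DataSet_getDataStreamList has to do. The following is only a minimal sketch of that lookup, assuming the usual Wendelin relation layout (Data Ingestion Lines aggregating both the Data Set and its Data Stream) and standard ERP5 accessors (getAggregateValue, the aggregate_uid catalog key); the real skin script may differ:

    # Hedged sketch of a DataSet_getDataStreamList-style lookup, not the actual script.
    portal = context.getPortalObject()
    data_stream_list = []
    # 1. Query MariaDB (portal_catalog) to find the ingestion lines of this Data Set
    for brain in portal.portal_catalog(portal_type="Data Ingestion Line",
                                       aggregate_uid=context.getUid()):
      # 2. Read the Data Ingestion Line and its Data Stream from the ZODB
      data_stream = brain.getObject().getAggregateValue(portal_type="Data Stream")
      if data_stream is not None:
        data_stream_list.append(data_stream)
    return data_stream_list
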
  • @Tyagov these changes, together with the one removing the validation state filter in 98f0da9d, result in the getDataStreams script returning invalid data streams, and then ebulk finds inconsistencies between the user's local dataset and the server's dataset.

    I don't have the full list of scenarios where this causes problems yet, but one of them is the bug about resuming interrupted ingestions (probably a split file).

    Another scenario is after a file deletion. The server script still returns the data stream that corresponds to the removed file. For example, a dataset containing three valid files and one deleted file returns this result list:

    • {"hash": "b2bc78a5dd45a0692f89d0bc2ad9372e", #invalid data stream, see the reference
      • "id": "data_stream_module/roque_test_dataset_20200526-115155-575_END",
      • "reference": "test_dataset/file_to_be_deleted/none_invalid", "size": 8},
    • {"hash": "b2bc78a5dd45a0692f89d0bc2ad9372e",
      • "id": "data_stream_module/roque_test_dataset_20200520-081423-825_END",
      • "reference": "test_dataset/example1/csv", "size": 8},
    • {"hash": "928497b06986a4e97f85dba74196f3b5",
      • "id": "data_stream_module/roque_test_dataset_20200520-081425-437_END",
      • "reference": "test_dataset/example2/csv", "size": 6},
    • {"hash": "d41d8cd98f00b204e9800998ecf8427e",
      • "id": "data_stream_module/roque_test_dataset_20200520-081426-137_END",
      • "reference": "test_dataset/empty/txt", "size": 0}

    As a consequence, ebulk receives more data streams than the dataset really has, and informs the user that the local dataset is outdated.

    I'll fix the query so that only valid data streams are listed by the ERP5Site_getDataStreamList script.
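
    A hedged sketch of that kind of filter (an assumption about the eventual fix, not committed code): keep only Data Streams in a valid workflow state, so the stream of a deleted file, like the first entry in the list above, is no longer returned to ebulk.

        # Hypothetical filter in ERP5Site_getDataStreamList, shown for illustration only.
        data_stream_list = []
        for stream in data_set.DataSet_getDataStreamList():
          if stream.getValidationState() != "validated":
            continue  # e.g. the invalidated stream of the deleted file
          data_stream_list.append({'id': 'data_stream_module/' + stream.getId(),
                                   'reference': stream.getReference()})  # hash and size as in the existing script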

  • @rporchetto, I see, but this shows a bigger issue:

    A Data Set and its related Data Streams can also be in the published state. Filtering only on "validated" is an oversimplification. The getDataStreams script should return what the user may see / access.

    Just imagine that you first update ebulk and the server-side code to support 50 MB+ chunks which are saved as-is on the server side and reconstructed on the client (ebulk) side. This would make the code that "merges" and "invalidates" Data Streams unnecessary, and then the filtering problem is automatically gone.

    I strongly suggest implementing the above first and then fixing the filtering if still needed.
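
    For illustration only, a rough sketch of the client-side reconstruction described above; the function name and chunk naming scheme are assumptions, not existing ebulk code:

        CHUNK_SIZE = 50 * 1024 * 1024  # the "50 MB+" chunks mentioned above

        def reconstruct_file(chunk_path_list, output_path):
          """Concatenate chunks downloaded as-is from the server back into the
          original file on the ebulk (client) side, so the server never has to
          merge or invalidate Data Streams. Illustrative sketch only."""
          with open(output_path, "wb") as output:
            # chunks are assumed to sort in upload order, e.g. data.csv.0000, data.csv.0001, ...
            for chunk_path in sorted(chunk_path_list):
              with open(chunk_path, "rb") as chunk:
                while True:
                  block = chunk.read(CHUNK_SIZE)
                  if not block:
                    break
                  output.write(block)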

  • A Data Set and its related Data Streams can also be in the published state. Filtering only on "validated" is an oversimplification. The getDataStreams script should return what the user may see / access.

    Independently of the split-file change, the script as it is right now does not respect what you said: it currently returns things that the user shouldn't see / access.

    The script should never return a data stream for a file that was deleted, regardless of the split-file topic.

    Unless the idea is that the client side is in charge of filtering out and discarding those cases, but that doesn't make much sense from a semantic point of view.

    Edited by Roque
  • @rporchetto, I see your point. What I really meant is that no Data Stream should ever be deleted in the first place (I meant the case of 50 MB chunks which are deleted after concatenation). This is inefficient for any DB approach.

    Still, it raises a fundamental question: which side stores what.

    Imo ebulk (the client side) stores content, and the human there makes sure the content is OK (validated). The server side (Wendelin) is responsible for storing it efficiently and handling security (i.e. what a user may see) through a proper workflow, which itself has the needed states to indicate status (i.e. Validated, Public, etc.).

    This way the spirit of ebulk, which more or less mimics git, is kept: GitLab stores what git sends to it and handles security.

    But back to the topic: I'm OK with filtering by state now.
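
    A sketch of that compromise (the state names are taken from the discussion, the rest is assumed, not committed code): filter on the workflow states a user may legitimately see / access, covering both validated and published documents.

        # Hypothetical state filter, shown for illustration only.
        visible_state_list = ("validated", "published")
        data_stream_list = [stream for stream in data_set.DataSet_getDataStreamList()
                            if stream.getValidationState() in visible_state_list]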
