Merge pull request #8506 from AtKristijan/deduplication-of-resource-i…

…mport Fix bug with import resource when there are duplicates.
learningequality · Mar 19, 2022 · db62f2b · db62f2b
2 parents 0120fc2 + 936868e
commit db62f2b
Show file tree

Hide file tree

Showing 7 changed files with 13 additions and 23 deletions.
diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py
@@ -413,7 +413,6 @@ def _transfer(  # noqa: max-complexity=16
             ContentNode.objects.filter(channel_id=channel_id, available=True)
             .exclude(kind=content_kinds.TOPIC)
             .values("content_id")
-            .distinct()
             .count()
         )
 

diff --git a/kolibri/core/content/utils/annotation.py b/kolibri/core/content/utils/annotation.py
@@ -766,10 +766,7 @@ def calculate_published_size(channel):
 def calculate_total_resource_count(channel):
     content_nodes = ContentNode.objects.filter(channel_id=channel.id)
     channel.total_resource_count = (
-        content_nodes.filter(available=True)
-        .exclude(kind=content_kinds.TOPIC)
-        .dedupe_by_content_id()
-        .count()
+        content_nodes.filter(available=True).exclude(kind=content_kinds.TOPIC).count()
     )
     channel.save()
 

diff --git a/kolibri/core/content/utils/import_export_content.py b/kolibri/core/content/utils/import_export_content.py
@@ -138,8 +138,7 @@ def get_import_export_data(  # noqa: C901
     )
 
     queried_file_objects = {}
-
-    content_ids = set()
+    number_of_resources = 0
 
     while min_boundary < max_rght:
 
@@ -170,13 +169,11 @@ def get_import_export_data(  # noqa: C901
                 )
             )
 
-        included_content_ids = nodes_segment.values_list(
-            "content_id", flat=True
-        ).distinct()
+        count_content_ids = nodes_segment.count()
 
         # Only bother with this query if there were any resources returned above.
-        if included_content_ids:
-            content_ids.update(included_content_ids)
+        if count_content_ids:
+            number_of_resources = number_of_resources + count_content_ids
             file_objects = LocalFile.objects.filter(
                 files__contentnode__in=nodes_segment
             ).values("id", "file_size", "extension")
@@ -207,8 +204,7 @@ def get_import_export_data(  # noqa: C901
     files_to_download = list(queried_file_objects.values())
 
     total_bytes_to_transfer = sum(map(lambda x: x["file_size"] or 0, files_to_download))
-
-    return len(content_ids), files_to_download, total_bytes_to_transfer
+    return number_of_resources, files_to_download, total_bytes_to_transfer
 
 
 def retry_import(e):

diff --git a/kolibri/core/content/utils/upgrade.py b/kolibri/core/content/utils/upgrade.py
@@ -566,7 +566,6 @@ def get_import_data_for_update(
     i = 0
 
     updated_ids_slice = updated_resource_ids[i : i + batch_size]
-
     nodes_to_include = ContentNode.objects.filter(channel_id=channel_id)
 
     # if requested, filter out nodes we're not able to render

diff --git a/kolibri/core/public/test/test_api.py b/kolibri/core/public/test/test_api.py
@@ -204,7 +204,7 @@ def test_public_channel_lookup(self):
             "name": "science",
             "language": "es",  # root node language
             "description": "",
-            "total_resource_count": 2,  # should account for nodes with duplicate content_ids
+            "total_resource_count": 3,  # should account for nodes with duplicate content_ids
             "version": 0,
             "published_size": 20,
             "last_published": None,

diff --git a/kolibri/plugins/device/api.py b/kolibri/plugins/device/api.py
@@ -48,8 +48,7 @@ def to_representation(self, instance):
                     # count the total number of renderable non-topic resources in the channel
                     # (note: it's faster to count them all and then subtract the unrenderables, of which there are fewer)
                     value["total_resources"] = (
-                        channel_nodes.dedupe_by_content_id().count()
-                        - unrenderable_nodes.dedupe_by_content_id().count()
+                        channel_nodes.count() - unrenderable_nodes.count()
                     )
 
                 if "total_file_size" in include_fields:

diff --git a/kolibri/plugins/device/test/test_api.py b/kolibri/plugins/device/test/test_api.py
@@ -175,8 +175,8 @@ def test_channelmetadata_resource_info(self):
             get_params,
         )
         # N.B. Because of our not very good fixture data, most of our content nodes are by default not renderable
-        # Hence this will return 1 if everything is deduped properly.
-        self.assertEqual(response.data["total_resources"], 1)
+        # Hence this will return 2 if everything is duplicated properly.
+        self.assertEqual(response.data["total_resources"], 2)
         self.assertEqual(response.data["total_file_size"], 0)
         self.assertEqual(response.data["on_device_resources"], 4)
         self.assertEqual(response.data["on_device_file_size"], 0)
@@ -188,7 +188,7 @@ def test_channelmetadata_include_fields_filter_has_total_resources(self):
             reverse("kolibri:kolibri.plugins.device:device_channel-list"),
             {"include_fields": "total_resources"},
         )
-        self.assertEqual(response.data[0]["total_resources"], 1)
+        self.assertEqual(response.data[0]["total_resources"], 2)
 
     def test_channelmetadata_include_fields_filter_has_total_file_size(self):
         LocalFile.objects.filter(
@@ -263,7 +263,7 @@ def test_all_nodes_present_studio(self):
             data={"channel_id": self.the_channel_id},
             format="json",
         )
-        self.assertEqual(response.data["resource_count"], 2)
+        self.assertEqual(response.data["resource_count"], 3)
         self.assertEqual(
             response.data["file_size"],
             sum(
@@ -310,7 +310,7 @@ def test_all_nodes_present_export(self):
             data={"channel_id": self.the_channel_id, "export": True},
             format="json",
         )
-        self.assertEqual(response.data["resource_count"], 2)
+        self.assertEqual(response.data["resource_count"], 3)
         self.assertEqual(
             response.data["file_size"],
             sum(
-Original file line number
+Diff line change
@@ Expand Up / @@ -566,7 +566,6 @@ def get_import_data_for_update( @@
         i = 0
         updated_ids_slice = updated_resource_ids[i : i + batch_size]
         nodes_to_include = ContentNode.objects.filter(channel_id=channel_id)
         # if requested, filter out nodes we're not able to render
@@ Expand Down @@