Any suggestions to improve my recursive function?
Question:
I have a list of category objects like the one below. The problem with this list is that categories and brands are mixed together, and I only need to get the brands from it.
I know which ones are the brands because, if I follow the parentCategoryIds upwards, I reach the root parent (which is id: brands, parentCategoryId: None).
# Flat list of category records. Each record names its parent through
# "parentCategoryId"; a record whose parent is None is a root.
categories = [
# Brand subtree: these records lead back to the parentless "brands" record.
{ "id": "brands", "parentCategoryId": None },
{ "id": "ls", "parentCategoryId": "brands" },
{ "id": "bleed", "parentCategoryId": "brands" },
{ "id": "shape", "parentCategoryId": "brands" },
{ "id": "graze", "parentCategoryId": "brands" },
{ "id": "item", "parentCategoryId": "brands" },
{ "id": "install", "parentCategoryId": "brands" },
{ "id": "horror", "parentCategoryId": "brands" },
{ "id": "thanks", "parentCategoryId": "brands" },
{ "id": "scrape", "parentCategoryId": "brands" },
{ "id": "shelter", "parentCategoryId": "brands" },
{ "id": "dynamic", "parentCategoryId": "brands" },
{ "id": "under", "parentCategoryId": "shape" },
{ "id": "right", "parentCategoryId": "shape" },
{ "id": "base", "parentCategoryId": "shape" },
{ "id": "scrap", "parentCategoryId": "shape" },
# Non-brand subtree: these records lead back to the parentless "root" record.
# NOTE(review): "rich" appears twice with identical fields - confirm whether
# the duplicate is intentional.
{ "id": "root", "parentCategoryId": None },
{ "id": "bark", "parentCategoryId": "rich" },
{ "id": "rich", "parentCategoryId": "sting" },
{ "id": "rich", "parentCategoryId": "sting" },
{ "id": "sting", "parentCategoryId": "root" },
]
To solve this issue, I wrote the function below. But I think it will be very slow: this list is only an example, and the original list has hundreds of records.
In this function, I’m navigating through the parents until I find the root (if the root == brands, I know I have a brand, so I can add it to a separate list; if not, I just ignore it).
I’d like to know if I can do it better, so that passing a bigger list would not be a problem.
# Collects the records identified as brands; filled by the loop over
# `categories` further down.
brands = []
def getParent(id):
    """Look up a category record by its id.

    Scans the module-level ``categories`` list in order and returns the
    first record whose ``'id'`` equals *id*. Returns ``None`` when no
    record matches.
    """
    matches = (entry for entry in categories if entry['id'] == id)
    return next(matches, None)
def is_brand(obj, *, lookup=None):
    """Return True when *obj*'s parent chain ends at the parentless "brands" record.

    The chain is followed through ``parentCategoryId`` one record at a time.

    Args:
        obj: A record (mapping) with ``'id'`` and ``'parentCategoryId'`` keys.
        lookup: Optional callable mapping an id to its record, or to ``None``
            when the id is unknown. Defaults to ``getParent``, which scans the
            whole list on every step; pass e.g. ``index.get`` of a dict keyed
            by id to make each step O(1).

    Returns:
        ``True`` if the chain reaches the record with id ``'brands'`` and no
        parent. ``False`` in every other case: the chain reaches ``'root'``,
        reaches some other parentless record, references an id that has no
        record, or loops back on itself.
    """
    if lookup is None:
        # Resolved at call time so the default follows the module's getParent.
        lookup = getParent

    visited = set()
    while obj is not None:
        node_id = obj['id']
        parent_id = obj['parentCategoryId']
        if node_id == 'brands' and parent_id is None:
            return True
        if node_id == 'root' or parent_id is None:
            return False
        if node_id in visited:
            # Cyclic parent chain: stop instead of recursing forever.
            return False
        visited.add(node_id)
        obj = lookup(parent_id)
    # The chain referenced an id with no record.
    return False
# Keep, in their original order, the records that is_brand() accepts,
# then show the result.
brands.extend(filter(is_brand, categories))
print(brands)
Answers:
This has a better time complexity, O(n * m), where m is the maximum depth of this graph; yours is O(n^2 * m), if I calculated correctly.
def get_brands(items=None):
    """Return the ids of all records that belong to the "brands" tree.

    Args:
        items: Optional iterable of records (mappings with ``'id'`` and
            ``'parentCategoryId'`` keys). Defaults to the module-level
            ``categories`` list.

    Returns:
        A list of ids, in first-seen order, containing ``'brands'`` itself
        (if present) and every id whose parent chain reaches ``'brands'``.
        Ids whose chain references an unknown parent, or loops back on
        itself, are treated as non-brands.
    """
    if items is None:
        items = categories

    # id -> parent id; when an id is repeated, the last record wins.
    parent_of = {item['id']: item['parentCategoryId'] for item in items}

    brands = []
    for node_id, parent in parent_of.items():
        if node_id == 'brands':
            brands.append(node_id)
            continue
        # Track visited ids so a cyclic chain ends the walk instead of hanging.
        visited = {node_id}
        while parent is not None and parent != 'brands' and parent not in visited:
            visited.add(parent)
            # .get(): an unknown parent id ends the walk instead of raising.
            parent = parent_of.get(parent)
        if parent == 'brands':
            brands.append(node_id)
    return brands
Another approach I was thinking about is using a disjoint set, but I couldn’t find a built-in library that provides that data structure in Python.
Improving is_brand()’s resource consumption looks like solving an XY problem.
The way you present it, the search may be avoided as simply as this:
brands = [
{ "id": "brands", "parentCategoryId": None },
{ "id": "ls", "parentCategoryId": "brands" },
…
{ "id": "scrap", "parentCategoryId": "shape" },
]
non_brands = [
# categories
{ "id": "root", "parentCategoryId": None },
{ "id": "bark", "parentCategoryId": "rich" },
…
]
categories = brands + non_brands
There are brands and categories, described by attributes id and parentCategoryId.
Modelling brands as (Python) objects:
If there are responsibilities for brands and categories alike, there should be a common base class:
class node:
    """A tree node with a non-None id and an optional parent.

    Serves as the common base class for ``brand`` and ``category``.
    Uniqueness of ids is not enforced here.
    """

    def __init__(self, node_id, parent=None):
        """Store the node's id and parent.

        Args:
            node_id: Identifier of the node; must not be ``None``.
            parent: The parent of this node, or ``None`` for a root.

        Raises:
            ValueError: If *node_id* is ``None``.
        """
        if node_id is None:
            # Raise a real exception: the previous `raise hell` referenced an
            # undefined name and surfaced as a NameError.
            raise ValueError("node_id must not be None")
        self.id = node_id
        self.parent = parent

    def is_brand(self):
        """Return False; a plain node is not a brand (``brand`` overrides this)."""
        return False
class brand(node):
    """Node that belongs to the tree whose root category is "brands"."""

    def is_brand(self):
        """Report that this node is a brand."""
        return True
class category(node):
    """Node that belongs to the tree whose root category is "root"."""
I have a list of category objects like the one below. The problem with this list is that categories and brands are mixed together, and I only need to get the brands from it.
I know which ones are the brands because, if I follow the parentCategoryIds upwards, I reach the root parent (which is id: brands, parentCategoryId: None).
# Flat list of category records. Each record names its parent through
# "parentCategoryId"; a record whose parent is None is a root.
categories = [
# Brand subtree: these records lead back to the parentless "brands" record.
{ "id": "brands", "parentCategoryId": None },
{ "id": "ls", "parentCategoryId": "brands" },
{ "id": "bleed", "parentCategoryId": "brands" },
{ "id": "shape", "parentCategoryId": "brands" },
{ "id": "graze", "parentCategoryId": "brands" },
{ "id": "item", "parentCategoryId": "brands" },
{ "id": "install", "parentCategoryId": "brands" },
{ "id": "horror", "parentCategoryId": "brands" },
{ "id": "thanks", "parentCategoryId": "brands" },
{ "id": "scrape", "parentCategoryId": "brands" },
{ "id": "shelter", "parentCategoryId": "brands" },
{ "id": "dynamic", "parentCategoryId": "brands" },
{ "id": "under", "parentCategoryId": "shape" },
{ "id": "right", "parentCategoryId": "shape" },
{ "id": "base", "parentCategoryId": "shape" },
{ "id": "scrap", "parentCategoryId": "shape" },
# Non-brand subtree: these records lead back to the parentless "root" record.
# NOTE(review): "rich" appears twice with identical fields - confirm whether
# the duplicate is intentional.
{ "id": "root", "parentCategoryId": None },
{ "id": "bark", "parentCategoryId": "rich" },
{ "id": "rich", "parentCategoryId": "sting" },
{ "id": "rich", "parentCategoryId": "sting" },
{ "id": "sting", "parentCategoryId": "root" },
]
To solve this issue, I wrote the function below. But I think it will be very slow: this list is only an example, and the original list has hundreds of records.
In this function, I’m navigating through the parents until I find the root (if the root == brands, I know I have a brand, so I can add it to a separate list; if not, I just ignore it).
I’d like to know if I can do it better, so that passing a bigger list would not be a problem.
# Collects the records identified as brands; filled by the loop over
# `categories` further down.
brands = []
def getParent(id):
    """Look up a category record by its id.

    Scans the module-level ``categories`` list in order and returns the
    first record whose ``'id'`` equals *id*. Returns ``None`` when no
    record matches.
    """
    matches = (entry for entry in categories if entry['id'] == id)
    return next(matches, None)
def is_brand(obj, *, lookup=None):
    """Return True when *obj*'s parent chain ends at the parentless "brands" record.

    The chain is followed through ``parentCategoryId`` one record at a time.

    Args:
        obj: A record (mapping) with ``'id'`` and ``'parentCategoryId'`` keys.
        lookup: Optional callable mapping an id to its record, or to ``None``
            when the id is unknown. Defaults to ``getParent``, which scans the
            whole list on every step; pass e.g. ``index.get`` of a dict keyed
            by id to make each step O(1).

    Returns:
        ``True`` if the chain reaches the record with id ``'brands'`` and no
        parent. ``False`` in every other case: the chain reaches ``'root'``,
        reaches some other parentless record, references an id that has no
        record, or loops back on itself.
    """
    if lookup is None:
        # Resolved at call time so the default follows the module's getParent.
        lookup = getParent

    visited = set()
    while obj is not None:
        node_id = obj['id']
        parent_id = obj['parentCategoryId']
        if node_id == 'brands' and parent_id is None:
            return True
        if node_id == 'root' or parent_id is None:
            return False
        if node_id in visited:
            # Cyclic parent chain: stop instead of recursing forever.
            return False
        visited.add(node_id)
        obj = lookup(parent_id)
    # The chain referenced an id with no record.
    return False
# Keep, in their original order, the records that is_brand() accepts,
# then show the result.
brands.extend(filter(is_brand, categories))
print(brands)
This has a better time complexity, O(n * m), where m is the maximum depth of this graph; yours is O(n^2 * m), if I calculated correctly.
def get_brands(items=None):
    """Return the ids of all records that belong to the "brands" tree.

    Args:
        items: Optional iterable of records (mappings with ``'id'`` and
            ``'parentCategoryId'`` keys). Defaults to the module-level
            ``categories`` list.

    Returns:
        A list of ids, in first-seen order, containing ``'brands'`` itself
        (if present) and every id whose parent chain reaches ``'brands'``.
        Ids whose chain references an unknown parent, or loops back on
        itself, are treated as non-brands.
    """
    if items is None:
        items = categories

    # id -> parent id; when an id is repeated, the last record wins.
    parent_of = {item['id']: item['parentCategoryId'] for item in items}

    brands = []
    for node_id, parent in parent_of.items():
        if node_id == 'brands':
            brands.append(node_id)
            continue
        # Track visited ids so a cyclic chain ends the walk instead of hanging.
        visited = {node_id}
        while parent is not None and parent != 'brands' and parent not in visited:
            visited.add(parent)
            # .get(): an unknown parent id ends the walk instead of raising.
            parent = parent_of.get(parent)
        if parent == 'brands':
            brands.append(node_id)
    return brands
Another approach I was thinking about is using a disjoint set, but I couldn’t find a built-in library that provides that data structure in Python.
Improving is_brand()’s resource consumption looks like solving an XY problem.
The way you present it, the search may be avoided as simply as this:
brands = [
{ "id": "brands", "parentCategoryId": None },
{ "id": "ls", "parentCategoryId": "brands" },
…
{ "id": "scrap", "parentCategoryId": "shape" },
]
non_brands = [
# categories
{ "id": "root", "parentCategoryId": None },
{ "id": "bark", "parentCategoryId": "rich" },
…
]
categories = brands + non_brands
There are brands and categories, described by attributes id and parentCategoryId.
Modelling brands as (Python) objects:
If there are responsibilities for brands and categories alike, there should be a common base class:
class node:
    """A tree node with a non-None id and an optional parent.

    Serves as the common base class for ``brand`` and ``category``.
    Uniqueness of ids is not enforced here.
    """

    def __init__(self, node_id, parent=None):
        """Store the node's id and parent.

        Args:
            node_id: Identifier of the node; must not be ``None``.
            parent: The parent of this node, or ``None`` for a root.

        Raises:
            ValueError: If *node_id* is ``None``.
        """
        if node_id is None:
            # Raise a real exception: the previous `raise hell` referenced an
            # undefined name and surfaced as a NameError.
            raise ValueError("node_id must not be None")
        self.id = node_id
        self.parent = parent

    def is_brand(self):
        """Return False; a plain node is not a brand (``brand`` overrides this)."""
        return False
class brand(node):
    """Node that belongs to the tree whose root category is "brands"."""

    def is_brand(self):
        """Report that this node is a brand."""
        return True
class category(node):
    """Node that belongs to the tree whose root category is "root"."""