Python: how to identify whether an instance has been changed without making a copy?
Question:
I’m trying to write a function to determine if an object has been modified at one point during the execution of the program.
I don’t want to duplicate the object because this will take a lot of memory.
My object is a dataclass and has a few lists of dataclasses that might have nested dataclasses within them, but at the bottom level you’ll only find primitive variables (str, ints, bool, …)
Since these objects need to be modifiable, I can’t use frozen=True.
What I’ve come up with so far is hash(str(self)) == PreviousHash,
but this slows down greatly as the amount of data increases.
What would you do to get a "hash" of a dataclass instance like this without having to do a slow conversion to a string first?
Answers:
It’s not a foolproof solution, but you could use a Proxy class to wrap your dataclasses recursively and set a flag whenever anything is modified. It’ll impact performance somewhat by adding indirections and dynamic behavior, but at least it won’t get worse as the dataclasses get larger.
Here’s a first draft at it.
class Proxy:
    """
    Wraps an object to keep track of modifications, including to its children.

    Attribute and item access is forwarded to the wrapped object. Any
    non-primitive value that comes back is wrapped in another ``Proxy`` that
    shares the same modification flag, so a change anywhere in the tree is
    visible through ``is_modified`` on every proxy of that tree.

    Known limitations:
      * Only changes made *through* a proxy are seen; code holding the raw
        object can still modify it unnoticed.
      * For built-in mutators (``list.append`` etc.) the flag is raised when
        the method is looked up, not when it is actually called.
    """

    # Immutable leaf values that are handed back unwrapped. ``None`` is part
    # of the set so that ``proxy.attr is None`` keeps working.
    _LEAF_TYPES = (int, str, float, bool, bytes, type(None))

    # Qualified names of built-in methods that modify their receiver.
    _MUTATORS = frozenset({
        'list.append', 'list.pop', 'list.clear', 'list.extend',
        'list.insert', 'list.remove', 'list.sort', 'list.reverse',
        'dict.popitem', 'dict.update', 'dict.pop', 'dict.clear',
        'dict.setdefault',
        'set.add', 'set.discard', 'set.remove', 'set.pop', 'set.clear',
        'set.update', 'set.difference_update', 'set.intersection_update',
        'set.symmetric_difference_update',
    })

    def __init__(self, obj, modified_flag=None):
        """
        Wrap ``obj``.

        ``modified_flag`` is a one-element list used as a shared mutable cell;
        child proxies receive their parent's list so they all flip the same
        flag. Leave it as ``None`` to start a fresh, unmodified tree.
        """
        # Must use `super().__setattr__` to avoid recursing on itself.
        super().__setattr__('_obj', obj)
        super().__setattr__('_modified_flag', modified_flag or [False])

    @property
    def is_modified(self):
        """ Returns True if any object in this tree has been modified. """
        return self._modified_flag[0]

    def _set_modified(self):
        """ Raise the shared flag for the whole proxy tree. """
        self._modified_flag[0] = True

    def _wrap_subvalue(self, value):
        """
        Given an attribute or index value, decides if it should be returned as-is
        (e.g. primitive types), wrapped in another Proxy (e.g. substructures), or
        if it's a modifying function call and the respective flag should be set.
        """
        if isinstance(value, self._LEAF_TYPES):
            return value
        if callable(value):
            # Not every callable has a __qualname__ (e.g. instances defining
            # __call__), so look it up defensively.
            if getattr(value, '__qualname__', None) in self._MUTATORS:
                self._set_modified()
            return value
        return Proxy(obj=value, modified_flag=self._modified_flag)

    def __getattr__(self, name):
        # Only reached when normal lookup fails. If the proxy's own slots are
        # missing (instance created without __init__, e.g. by copy/pickle),
        # touching self._obj below would re-enter this method forever.
        if name in ('_obj', '_modified_flag'):
            raise AttributeError(name)
        return self._wrap_subvalue(getattr(self._obj, name))

    # The mutating hooks below raise the flag *before* performing the
    # operation: a false positive is preferable to a missed change.

    def __setattr__(self, name, value):
        self._set_modified()
        setattr(self._obj, name, value)

    def __delattr__(self, name):
        self._set_modified()
        delattr(self._obj, name)

    def __getitem__(self, index):
        return self._wrap_subvalue(self._obj[index])

    def __setitem__(self, index, value):
        self._set_modified()
        self._obj[index] = value

    def __delitem__(self, index):
        self._set_modified()
        del self._obj[index]

    # Special methods are looked up on the type, bypassing __getattr__, so
    # the read-only container protocol has to be forwarded explicitly.

    def __len__(self):
        return len(self._obj)

    def __bool__(self):
        return bool(self._obj)

    def __iter__(self):
        return (self._wrap_subvalue(item) for item in self._obj)

    def __contains__(self, item):
        return item in self._obj
And you’d use it like this:
from dataclasses import dataclass
@dataclass
class Child:
    """Leaf record holding a single numeric value."""

    value: float


@dataclass
class Parent:
    """Named record that owns a list of ``Child`` records."""

    name: str
    children: list[Child]

    def bogus_operation(self) -> float:
        """Return the arithmetic mean of the children's ``value`` fields.

        Returns ``0.0`` when there are no children instead of dividing by
        zero. The method only reads state; it never modifies the instance.
        """
        if not self.children:
            return 0.0
        return sum(child.value for child in self.children) / len(self.children)
# Demo: build a plain object tree, then access it only through a Proxy.
raw_parent = Parent('parent name', [Child(value=5), Child(value=4)])
parent = Proxy(raw_parent) # Proxy here!
# A read-only method call goes through the proxy without raising the flag,
# so the first print shows False.
parent.bogus_operation()
print(parent.is_modified)
# Assigning an attribute on a nested child and looking up list.append on the
# nested list both raise the shared flag, so the second print shows True.
parent.children[0].value = 2
parent.children.append(Child(1))
print(parent.is_modified)
Does this work?
# Baseline fingerprints keyed by id(obj), recorded the first time an object
# is checked. Only a fixed-size digest is kept, never a copy of the object.
_BASELINE_FINGERPRINTS = {}


def has_object_been_modified(obj):
    """Checks if an object has been modified during program execution.

    The first call for a given object records a fingerprint of its current
    content and returns False. Every later call compares the object's current
    fingerprint against that first one.

    The fingerprint is a SHA-256 digest of ``pickle.dumps(obj)``, so it also
    works for unhashable containers such as lists and dicts. Objects that
    cannot be pickled fall back to ``hash(obj)``.

    Caveats:
      * Baselines are keyed by ``id(obj)``; if an object is garbage-collected
        and a new one is allocated at the same address, the stale baseline is
        reused.
      * Pickle output can differ for equal content when reference sharing
        inside the object changes, which would be reported as a modification.

    Returns True if the object has been modified, False otherwise.

    Raises:
        TypeError: if the object can be neither pickled nor hashed.
    """
    import hashlib
    import pickle

    try:
        payload = pickle.dumps(obj)
    except (pickle.PicklingError, TypeError, AttributeError):
        # Unpicklable: fall back to the built-in hash, which raises
        # TypeError for unhashable objects.
        fingerprint = hash(obj)
    else:
        fingerprint = hashlib.sha256(payload).digest()

    # setdefault stores the fingerprint on the first sighting and returns the
    # stored baseline on every later call.
    baseline = _BASELINE_FINGERPRINTS.setdefault(id(obj), fingerprint)
    return fingerprint != baseline
# Demo: run the check once on a sample list and print the verdict.
my_list = [1, 2, 3, 4]
has_been_modified = has_object_been_modified(my_list)
print(
    "The list has been modified!"
    if has_been_modified
    else "The list has not been modified."
)
I found something that works and isn’t as slow as hash(str(instance)) but still slows down as the data increases (it’s just within my tolerances).
Note: Slower for very small instances but much faster for very large instances.
I won’t accept it since it didn’t answer my specific question but in case this works for someone I’ll post it here.
from dataclasses import dataclass, field
import os
from pathlib import Path
import pickle
@dataclass
class Investigation:
    """Investigation record that can report whether it changed since a snapshot.

    Change detection compares ``hash(pickle.dumps(self))`` against the value
    stored in ``PreviousHash``. ``PreviousHash`` itself is always pickled as 0
    so that storing a new baseline does not alter the fingerprint.
    """

    Path: str
    PreviousHash: int = 0
    Users: list[User] = field(default_factory=list)
    Tasks: list[Task] = field(default_factory=list)
    Nodes: list[Node] = field(default_factory=list)
    Edges: list[Edge] = field(default_factory=list)

    def _content_hash(self) -> int:
        """Hash of the pickled instance (pickling goes through __getstate__)."""
        return hash(pickle.dumps(self))

    @property
    def Name(self) -> str:
        """File name of ``Path`` without its final suffix."""
        location = Path(self.Path)
        return location.stem

    @property
    def Directory(self) -> str:
        """Directory portion of ``Path``."""
        head, _tail = os.path.split(self.Path)
        return head

    def Snapshot(self):
        """Store the current content hash as the new baseline."""
        self.PreviousHash = self._content_hash()

    @property
    def HasChanges(self) -> bool:
        """True when the current content hash differs from the baseline."""
        return self._content_hash() != self.PreviousHash

    def __getstate__(self):
        # Pickle a copy of the instance dict with the baseline zeroed out, so
        # the stored hash never feeds into the data being hashed.
        return {**vars(self), "PreviousHash": 0}

    def __setstate__(self, state):
        vars(self).update(state)
        # The pickled baseline is always 0 (see __getstate__), so compute a
        # fresh one right after loading.
        self.PreviousHash = self._content_hash()
I’m trying to write a function to determine if an object has been modified at one point during the execution of the program.
I don’t want to duplicate the object because this will take a lot of memory.
My object is a dataclass and has a few lists of dataclasses that might have nested dataclasses within them, but at the bottom level you’ll only find primitive variables (str, ints, bool, …)
Since these objects need to be modifiable, I can’t use frozen=True.
What I’ve come up with so far is hash(str(self)) == PreviousHash,
but this slows down greatly as the amount of data increases.
What would you do to get a "hash" of a dataclass instance like this without having to do a slow conversion to a string first?
It’s not a foolproof solution, but you could use a Proxy class to wrap your dataclasses recursively and set a flag whenever anything is modified. It’ll impact performance somewhat by adding indirections and dynamic behavior, but at least it won’t get worse as the dataclasses get larger.
Here’s a first draft at it.
class Proxy:
    """
    Wraps an object to keep track of modifications, including to its children.

    Attribute and item access is forwarded to the wrapped object. Any
    non-primitive value that comes back is wrapped in another ``Proxy`` that
    shares the same modification flag, so a change anywhere in the tree is
    visible through ``is_modified`` on every proxy of that tree.

    Known limitations:
      * Only changes made *through* a proxy are seen; code holding the raw
        object can still modify it unnoticed.
      * For built-in mutators (``list.append`` etc.) the flag is raised when
        the method is looked up, not when it is actually called.
    """

    # Immutable leaf values that are handed back unwrapped. ``None`` is part
    # of the set so that ``proxy.attr is None`` keeps working.
    _LEAF_TYPES = (int, str, float, bool, bytes, type(None))

    # Qualified names of built-in methods that modify their receiver.
    _MUTATORS = frozenset({
        'list.append', 'list.pop', 'list.clear', 'list.extend',
        'list.insert', 'list.remove', 'list.sort', 'list.reverse',
        'dict.popitem', 'dict.update', 'dict.pop', 'dict.clear',
        'dict.setdefault',
        'set.add', 'set.discard', 'set.remove', 'set.pop', 'set.clear',
        'set.update', 'set.difference_update', 'set.intersection_update',
        'set.symmetric_difference_update',
    })

    def __init__(self, obj, modified_flag=None):
        """
        Wrap ``obj``.

        ``modified_flag`` is a one-element list used as a shared mutable cell;
        child proxies receive their parent's list so they all flip the same
        flag. Leave it as ``None`` to start a fresh, unmodified tree.
        """
        # Must use `super().__setattr__` to avoid recursing on itself.
        super().__setattr__('_obj', obj)
        super().__setattr__('_modified_flag', modified_flag or [False])

    @property
    def is_modified(self):
        """ Returns True if any object in this tree has been modified. """
        return self._modified_flag[0]

    def _set_modified(self):
        """ Raise the shared flag for the whole proxy tree. """
        self._modified_flag[0] = True

    def _wrap_subvalue(self, value):
        """
        Given an attribute or index value, decides if it should be returned as-is
        (e.g. primitive types), wrapped in another Proxy (e.g. substructures), or
        if it's a modifying function call and the respective flag should be set.
        """
        if isinstance(value, self._LEAF_TYPES):
            return value
        if callable(value):
            # Not every callable has a __qualname__ (e.g. instances defining
            # __call__), so look it up defensively.
            if getattr(value, '__qualname__', None) in self._MUTATORS:
                self._set_modified()
            return value
        return Proxy(obj=value, modified_flag=self._modified_flag)

    def __getattr__(self, name):
        # Only reached when normal lookup fails. If the proxy's own slots are
        # missing (instance created without __init__, e.g. by copy/pickle),
        # touching self._obj below would re-enter this method forever.
        if name in ('_obj', '_modified_flag'):
            raise AttributeError(name)
        return self._wrap_subvalue(getattr(self._obj, name))

    # The mutating hooks below raise the flag *before* performing the
    # operation: a false positive is preferable to a missed change.

    def __setattr__(self, name, value):
        self._set_modified()
        setattr(self._obj, name, value)

    def __delattr__(self, name):
        self._set_modified()
        delattr(self._obj, name)

    def __getitem__(self, index):
        return self._wrap_subvalue(self._obj[index])

    def __setitem__(self, index, value):
        self._set_modified()
        self._obj[index] = value

    def __delitem__(self, index):
        self._set_modified()
        del self._obj[index]

    # Special methods are looked up on the type, bypassing __getattr__, so
    # the read-only container protocol has to be forwarded explicitly.

    def __len__(self):
        return len(self._obj)

    def __bool__(self):
        return bool(self._obj)

    def __iter__(self):
        return (self._wrap_subvalue(item) for item in self._obj)

    def __contains__(self, item):
        return item in self._obj
And you’d use it like this:
from dataclasses import dataclass
@dataclass
class Child:
    """Leaf record holding a single numeric value."""

    value: float


@dataclass
class Parent:
    """Named record that owns a list of ``Child`` records."""

    name: str
    children: list[Child]

    def bogus_operation(self) -> float:
        """Return the arithmetic mean of the children's ``value`` fields.

        Returns ``0.0`` when there are no children instead of dividing by
        zero. The method only reads state; it never modifies the instance.
        """
        if not self.children:
            return 0.0
        return sum(child.value for child in self.children) / len(self.children)
# Demo: build a plain object tree, then access it only through a Proxy.
raw_parent = Parent('parent name', [Child(value=5), Child(value=4)])
parent = Proxy(raw_parent) # Proxy here!
# A read-only method call goes through the proxy without raising the flag,
# so the first print shows False.
parent.bogus_operation()
print(parent.is_modified)
# Assigning an attribute on a nested child and looking up list.append on the
# nested list both raise the shared flag, so the second print shows True.
parent.children[0].value = 2
parent.children.append(Child(1))
print(parent.is_modified)
Does this work?
# Baseline fingerprints keyed by id(obj), recorded the first time an object
# is checked. Only a fixed-size digest is kept, never a copy of the object.
_BASELINE_FINGERPRINTS = {}


def has_object_been_modified(obj):
    """Checks if an object has been modified during program execution.

    The first call for a given object records a fingerprint of its current
    content and returns False. Every later call compares the object's current
    fingerprint against that first one.

    The fingerprint is a SHA-256 digest of ``pickle.dumps(obj)``, so it also
    works for unhashable containers such as lists and dicts. Objects that
    cannot be pickled fall back to ``hash(obj)``.

    Caveats:
      * Baselines are keyed by ``id(obj)``; if an object is garbage-collected
        and a new one is allocated at the same address, the stale baseline is
        reused.
      * Pickle output can differ for equal content when reference sharing
        inside the object changes, which would be reported as a modification.

    Returns True if the object has been modified, False otherwise.

    Raises:
        TypeError: if the object can be neither pickled nor hashed.
    """
    import hashlib
    import pickle

    try:
        payload = pickle.dumps(obj)
    except (pickle.PicklingError, TypeError, AttributeError):
        # Unpicklable: fall back to the built-in hash, which raises
        # TypeError for unhashable objects.
        fingerprint = hash(obj)
    else:
        fingerprint = hashlib.sha256(payload).digest()

    # setdefault stores the fingerprint on the first sighting and returns the
    # stored baseline on every later call.
    baseline = _BASELINE_FINGERPRINTS.setdefault(id(obj), fingerprint)
    return fingerprint != baseline
# Demo: run the check once on a sample list and print the verdict.
my_list = [1, 2, 3, 4]
has_been_modified = has_object_been_modified(my_list)
print(
    "The list has been modified!"
    if has_been_modified
    else "The list has not been modified."
)
I found something that works and isn’t as slow as hash(str(instance)) but still slows down as the data increases (it’s just within my tolerances).
Note: Slower for very small instances but much faster for very large instances.
I won’t accept it since it didn’t answer my specific question but in case this works for someone I’ll post it here.
from dataclasses import dataclass, field
import os
from pathlib import Path
import pickle
@dataclass
class Investigation:
    """Investigation record that can report whether it changed since a snapshot.

    Change detection compares ``hash(pickle.dumps(self))`` against the value
    stored in ``PreviousHash``. ``PreviousHash`` itself is always pickled as 0
    so that storing a new baseline does not alter the fingerprint.
    """

    Path: str
    PreviousHash: int = 0
    Users: list[User] = field(default_factory=list)
    Tasks: list[Task] = field(default_factory=list)
    Nodes: list[Node] = field(default_factory=list)
    Edges: list[Edge] = field(default_factory=list)

    def _content_hash(self) -> int:
        """Hash of the pickled instance (pickling goes through __getstate__)."""
        return hash(pickle.dumps(self))

    @property
    def Name(self) -> str:
        """File name of ``Path`` without its final suffix."""
        location = Path(self.Path)
        return location.stem

    @property
    def Directory(self) -> str:
        """Directory portion of ``Path``."""
        head, _tail = os.path.split(self.Path)
        return head

    def Snapshot(self):
        """Store the current content hash as the new baseline."""
        self.PreviousHash = self._content_hash()

    @property
    def HasChanges(self) -> bool:
        """True when the current content hash differs from the baseline."""
        return self._content_hash() != self.PreviousHash

    def __getstate__(self):
        # Pickle a copy of the instance dict with the baseline zeroed out, so
        # the stored hash never feeds into the data being hashed.
        return {**vars(self), "PreviousHash": 0}

    def __setstate__(self, state):
        vars(self).update(state)
        # The pickled baseline is always 0 (see __getstate__), so compute a
        # fresh one right after loading.
        self.PreviousHash = self._content_hash()