14 KiB
14 KiB
Problems
Data Split
MasterSettingsModelstores: key, question, instructions, category etc....ChecklistItemstores: answer (expected value)- To get a complete "checklist item", need to JOIN both tables.
- Also current
MasterSettingsstore same keyterm for multiple checklists/playbooks.- meaning if a keyterm is changed but for one checklists it will change for others too
- to prevent such scenarios, funny strategies need to be implemented in code
- conclusion unnecessary complexity is introduced.
Funny ways versioning
- versioning means having older versions and newer versions and maintains a relation b/w them tooo so we know what change and how.
- currently no such versioning system is there.
complex queries for simple things
# Current: Get checklist with all items (3 queries + manual merge)
checklist = ChecklistMetadata.objects.get(checklist_id=checklist_id)
checklist_items = ChecklistItem.objects.filter(checklist_id=checklist_id)
keyterm_ids = [item["keyterm_id_id"] for item in checklist_items]
keyterms = MasterSettingsModel.objects.filter(RowKey__in=keyterm_ids)
# Then manually merge...
is_active for checklists
- can be null/none - no active checklist found
- race conditions when switching active checklist
- scope - active for which team?? which contract type???
- can delete active checklist accidentally
Goals
- Reduce Joins and Complexity
- Versioning: Full history for keyterms, checklists, and contracts
Document (File Storage)
class Document():
class DocumentType(models.TextChoices):
CONTRACT = "CONTRACT", "Contract"
CHECKLIST = "CHECKLIST", "Checklist"
PLAYBOOK = "PLAYBOOK", "Playbook"
class FileType(models.TextChoices):
DOC = "DOC", "Word Document (Legacy)"
PDF = "PDF", "PDF"
DOCX = "DOCX", "Word Document"
CSV = "CSV", "CSV"
XLSX = "XLSX", "Excel"
id = models.UUIDField(db_index=True)
organization = models.ForeignKey(
"Organization",
on_delete=models.CASCADE,
related_name="documents"
)
user = models.ForeignKey(
"User",
on_delete=models.SET_NULL,
)
document_type = models.CharField(
max_length=20,
choices=DocumentType.choices
)
name = models.CharField(max_length=500)
blob_url = models.URLField(max_length=2000)
file_type = models.CharField(max_length=20, choices=FileType.choices)
size_bytes = models.BigIntegerField()
total_pages = models.IntegerField(null=True, blank=True)
# versioning ezzz
version = models.IntegerField(default=1)
previous_version = models.ForeignKey(
"self",
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name="next_versions"
)
is_current = models.BooleanField(default=True)
def create_new_version(
self,
blob_url: str,
uploaded_by,
**file_metadata
) -> "Document":
self.is_current = False
self.save(update_fields=["is_current", "updated_at"])
return Document.objects.create(
id=self.id,
previous_version=self,
version=self.version + 1,
is_current=True,
organization=self.organization,
uploaded_by=uploaded_by,
document_type=self.document_type,
name=file_metadata.get("name", self.name),
blob_url=blob_url,
file_type=file_metadata.get("file_type", self.file_type),
size_bytes=file_metadata.get("size_bytes", self.size_bytes),
total_pages=file_metadata.get("total_pages"),
)
@classmethod
def get_current(cls, id: str) -> "Document":
return cls.objects.get(original_id=original_id, is_current=True)
@classmethod
def get_version_history(cls, original_id: str):
return cls.objects.filter(original_id=original_id).order_by("version")
@property
def size_kb(self) -> float:
return self.size_bytes / 1024
Checklist
class Checklist():
document = models.ForeignKey(
"Document",
on_delete=models.SET_NULL,
null=True,
blank=True,
)
id = models.UUIDField(db_index=True)
version = models.IntegerField(default=1)
previous_version = models.ForeignKey(
"self",
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name="next_versions"
)
is_current = models.BooleanField(default=True)
organization = models.ForeignKey(
"Organization",
on_delete=models.CASCADE,
)
created_by = models.ForeignKey(
"User",
on_delete=models.SET_NULL,
null=True,
)
name = models.CharField(max_length=200)
description = models.TextField(blank=True)
Keyterm
class Keyterm():
class KeytermType(models.TextChoices):
ANALYSIS = "ANALYSIS", "Analysis"
CHECKLIST = "CHECKLIST", "Checklist"
METADATA = "METADATA", "Metadata"
id = models.UUIDField(db_index=True)
version = models.IntegerField(default=1)
previous_version = models.ForeignKey(
"self",
on_delete=models.SET_NULL,
null=True,
blank=True,
related_name="next_versions"
)
is_current = models.BooleanField(default=True)
organization = models.ForeignKey(
"Organization",
on_delete=models.CASCADE,
)
key = models.CharField(max_length=500)
question = models.TextField()
instructions = models.TextField(blank=True)
type = models.CharField(
choices=KeytermType.choices,
default=KeytermType.ANALYSIS
)
expected_answer = models.TextField(null=True, blank=True)
is_active = models.BooleanField(default=True)
Contract
class Contract():
class AnalysisStatus(models.TextChoices):
PENDING = "PENDING", "Pending"
ACCEPTED = "ACCEPTED", "Accepted"
REJECTED = "REJECTED", "Rejected"
class ContractType(models.TextChoices):
MSA = "MSA", "msa"
NDA = "NDA", "NDA"
# .... many more
document = models.OneToOneField(
"Document",
on_delete=models.CASCADE,
related_name="contract"
)
organization = models.ForeignKey(
"Organization",
on_delete=models.CASCADE,
related_name="contracts"
)
contract_type = models.CharField(choices=ContractType)
analysis_status = models.CharField(
choices=AnalysisStatus.choices,
default=AnalysisStatus.PENDING
)
analyzed_at = models.DateTimeField(null=True, blank=True)
analyzed_by = models.ForeignKey(
"User",
on_delete=models.SET_NULL,
null=True,
related_name="analyzed_contracts"
)
def get_version_history(self):
return Document.get_version_history(self.document.original_id)
Analysis MODEL
- large unstructured textual data
- Also while fetching might need to fetch multiple rows of data for same contract
- Hence it makes sense to store this data in the Mongo DB document storage.
- Easy to store unstructured data because its just JSON
- Easy to query analysis for any contract
Contract specific data
Problem: COI has tenant_name, property_name etc. MSA has parties, term_length. different contracts have different fields.
Options:
- One table with all nullable fields, but it will be sparse and kinda messy
- Separate table per type (COI, MSA) -> lmaoo more schema tables than users
- JSON field -> flexible but no validations + gets messy after short amount of time
SOLUTION:
class Contract():
document = models.OneToOneField("Document", ...)
organization = models.ForeignKey("Organization", ...)
contract_type = models.CharField(choices=ContractType)
analysis_status = models.CharField(...)
...
class COIContract():
contract = models.OneToOneField(Contract, on_delete=models.CASCADE, related_name="coi_details")
tenant_name = models.CharField()
property_name = models.CharField()
property_unit = models.CharField()
expiry_date = models.DateField()
# ..... ezzzzz
class MSAContract():
contract = models.OneToOneField(Contract, on_delete=models.CASCADE, related_name="msa_details")
party_a = models.CharField(max_length=500)
party_b = models.CharField(max_length=500)
term_months = models.IntegerField(null=True)
auto_renewal = models.BooleanField(default=False)
Strategy Pattern for type specific logic
class IContractHandler():
def create_metadata(self, contract: Contract, metadata: dict) -> Any: ...
def get_metadata(self, contract: Contract) -> dict: ...
class COIHandler:
def create_metadata(self, contract: Contract, metadata: dict) -> COIDetails:
return COIDetails.objects.create(
contract=contract,
tenant_name=metadata.get("tenant_name", ""),
property_name=metadata.get("property_name", ""),
property_unit=metadata.get("property_unit", ""),
expiry_date=metadata.get("expiry_date"),
)
def get_metadata(self, contract: Contract) -> dict:
details = contract.coi_details
return {
"tenant_name": details.tenant_name,
"property_name": details.property_name,
"expiry_date": details.expiry_date,
}
# ... similar methods
class ContractService:
def __init__(self, ...):
@transaction.atomic
def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract:
document = self.document_service.upload(file=file, ...)
contract_type = self._detect_contract_type(document)
contract = Contract.objects.create(
document=document,
organization=self.organization,
contract_type=contract_type,
)
# implementation of strategy pattern in actual methods
handler = self.handler.get(contract_type)
handler.create_details(contract, metadata)
return contract
Benefits:
Contracttable stays clean- type specific data is normalized
- no changes to existing code when adding new types, adding new contract type = new detail table + new handler
- lastly, easy to test handlers in isolation
Dependency Injection
class IDocumentService(Protocol):
def upload(self, file: BinaryIO, document_type: str) -> Document: ...
def upload_new_version(self, document_id: str, file: BinaryIO) -> Document: ...
class IChecklistService(Protocol):
def get_active_checklist(self, organization_id: str, contract_type: str, team_id: str = None) -> Checklist: ...
class IContractService(Protocol):
def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract: ...
def replace_document(self, contract_id: str, file: BinaryIO) -> Contract: ...
class DocumentService:
def __init__(self, organization: Organization, user: User):
self.organization = organization
self.user = user
def upload(self, file: BinaryIO, document_type: str) -> Document:
pass
def upload_new_version(self, document_id: str, file: BinaryIO) -> Document:
pass
class ContractService:
def __init__(
self,
organization: Organization,
user: User,
document_service: IDocumentService,
checklist_service: IChecklistService
):
self.organization = organization
self.user = user
self.document_service = document_service
self.checklist_service = checklist_service
@transaction.atomic
def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract:
document = self.document_service.upload(
file=file,
document_type=Document.DocumentType.CONTRACT,
)
contract_type = self._detect_contract_type(document)
checklist = self.checklist_service.get_active_checklist(
organization_id=str(self.organization.id),
contract_type=contract_type,
team_id=team_id
)
return Contract.objects.create(
document=document,
organization=self.organization,
checklist=checklist,
contract_type=contract_type,
**metadata
)
class ServiceFactory:
def __init__(self, organization: Organization, user: User):
self.organization = organization
self.user = user
self._document_service = None
self._checklist_service = None
@property
def document_service(self) -> DocumentService:
if not self._document_service:
self._document_service = DocumentService(self.organization, self.user)
return self._document_service
@property
def checklist_service(self) -> ChecklistService:
if not self._checklist_service:
self._checklist_service = ChecklistService(self.organization)
return self._checklist_service
def contract_service(self) -> ContractService:
return ContractService(
organization=self.organization,
user=self.user,
document_service=self.document_service,
checklist_service=self.checklist_service
)
usage in views or anything
@api_view(["POST"])
@permission_classes([IsAuthenticated])
def upload_contract(request):
user = request.user
organization = user.organization
factory = ServiceFactory(organization, user)
contract_service = factory.contract_service()
contract = contract_service.create_contract(
file=request.FILES["file"],
)
return Response({"contract_id": str(contract.id)})