## Problems

### **Data Split**
- `MasterSettingsModel` stores: key, question, instructions, category, etc.
- `ChecklistItem` stores: answer (expected value)
- To get a complete "checklist item", both tables need to be JOINed.
- Also, the current `MasterSettings` stores the same keyterm for multiple checklists/playbooks.
  - meaning that if a keyterm is changed for one checklist, it changes for all the others too
  - to prevent such scenarios, ad-hoc workarounds need to be implemented in code
- **Conclusion**: unnecessary complexity is introduced.

### **Ad-hoc versioning**
- Versioning means keeping older and newer versions and maintaining a relation between them, so we know what changed and how.
- Currently there is no such versioning system.

### **Complex queries for simple things**
```python
# Current: Get checklist with all items (3 queries + manual merge)
checklist = ChecklistMetadata.objects.get(checklist_id=checklist_id)
checklist_items = ChecklistItem.objects.filter(checklist_id=checklist_id)
keyterm_ids = [item["keyterm_id_id"] for item in checklist_items]
keyterms = MasterSettingsModel.objects.filter(RowKey__in=keyterm_ids)
# Then manually merge...
```

### **`is_active` for checklists**
- can be null/None, i.e. no active checklist is found
- race conditions when switching the active checklist
- scope is undefined: active for which team? for which contract type?
- the active checklist can be deleted accidentally

---

### Goals
- **Reduce Joins and Complexity**
- **Versioning**: Full history for keyterms, checklists, and contracts

### Document (File Storage)
```python
import uuid

from django.db import models, transaction


class Document(models.Model):
    """One stored version of an uploaded file.

    Every version of the same file is its own row; all rows of one file share
    the same `original_id`, and exactly one of them has `is_current=True`.
    """

    class DocumentType(models.TextChoices):
        CONTRACT = "CONTRACT", "Contract"
        CHECKLIST = "CHECKLIST", "Checklist"
        PLAYBOOK = "PLAYBOOK", "Playbook"

    class FileType(models.TextChoices):
        DOC = "DOC", "Word Document (Legacy)"
        PDF = "PDF", "PDF"
        DOCX = "DOCX", "Word Document"
        CSV = "CSV", "CSV"
        XLSX = "XLSX", "Excel"

    # Logical ID shared by all versions of the same file. It must not be named
    # `id`: Django only accepts a field called `id` when it is the primary key,
    # and this value is intentionally repeated across version rows.
    original_id = models.UUIDField(default=uuid.uuid4, editable=False, db_index=True)
    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
        related_name="documents",
    )
    user = models.ForeignKey(
        "User",
        on_delete=models.SET_NULL,
        null=True,  # SET_NULL requires a nullable column
        blank=True,
    )
    document_type = models.CharField(
        max_length=20,
        choices=DocumentType.choices,
    )

    name = models.CharField(max_length=500)
    blob_url = models.URLField(max_length=2000)
    file_type = models.CharField(max_length=20, choices=FileType.choices)
    size_bytes = models.BigIntegerField()
    total_pages = models.IntegerField(null=True, blank=True)

    # versioning
    version = models.IntegerField(default=1)
    previous_version = models.ForeignKey(
        "self",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="next_versions",
    )
    is_current = models.BooleanField(default=True)
    # Referenced by create_new_version() in `update_fields`.
    updated_at = models.DateTimeField(auto_now=True)

    def create_new_version(
        self,
        blob_url: str,
        uploaded_by,
        **file_metadata,
    ) -> "Document":
        """Create and return the next version of this document.

        Marks this row as no longer current and inserts a new row that shares
        `original_id`, points back to this row through `previous_version` and
        has `version` incremented by one.

        Args:
            blob_url: Storage URL of the newly uploaded file.
            uploaded_by: User who uploaded the new version.
            **file_metadata: Optional overrides for `name`, `file_type`,
                `size_bytes` and `total_pages`. The first three default to the
                values of this version; `total_pages` defaults to None.
        """
        # Both writes succeed or fail together, so a failure cannot leave the
        # document without a current version.
        with transaction.atomic():
            self.is_current = False
            self.save(update_fields=["is_current", "updated_at"])
            return Document.objects.create(
                original_id=self.original_id,
                previous_version=self,
                version=self.version + 1,
                is_current=True,
                organization=self.organization,
                user=uploaded_by,
                document_type=self.document_type,
                name=file_metadata.get("name", self.name),
                blob_url=blob_url,
                file_type=file_metadata.get("file_type", self.file_type),
                size_bytes=file_metadata.get("size_bytes", self.size_bytes),
                total_pages=file_metadata.get("total_pages"),
            )

    @classmethod
    def get_current(cls, original_id: str) -> "Document":
        """Return the current version of the file identified by `original_id`."""
        return cls.objects.get(original_id=original_id, is_current=True)

    @classmethod
    def get_version_history(cls, original_id: str):
        """Return all versions of the file, oldest first."""
        return cls.objects.filter(original_id=original_id).order_by("version")

    @property
    def size_kb(self) -> float:
        """File size in kilobytes (1 KB = 1024 bytes)."""
        return self.size_bytes / 1024
```

### Checklist
```python
class Checklist(models.Model):
    document = models.ForeignKey(
        "Document",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
    )

    # Logical ID shared by all versions of the same checklist
    # (same scheme as Document.original_id).
    original_id = models.UUIDField(db_index=True)
    version = models.IntegerField(default=1)
    previous_version = models.ForeignKey(
        "self",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="next_versions",
    )
    is_current = models.BooleanField(default=True)

    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
    )
    created_by = models.ForeignKey(
        "User",
        on_delete=models.SET_NULL,
        null=True,
    )
    name = models.CharField(max_length=200)
    description = models.TextField(blank=True)
```

### Keyterm
```python
class Keyterm(models.Model):
    # NOTE(review): nothing here links a Keyterm to a Checklist yet -- confirm
    # how checklist items are attached to their checklist.

    class KeytermType(models.TextChoices):
        ANALYSIS = "ANALYSIS", "Analysis"
        CHECKLIST = "CHECKLIST", "Checklist"
        METADATA = "METADATA", "Metadata"

    # Logical ID shared by all versions of the same keyterm
    # (same scheme as Document.original_id).
    original_id = models.UUIDField(db_index=True)
    version = models.IntegerField(default=1)
    previous_version = models.ForeignKey(
        "self",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="next_versions",
    )
    is_current = models.BooleanField(default=True)

    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
    )
    key = models.CharField(max_length=500)
    question = models.TextField()
    instructions = models.TextField(blank=True)
    type = models.CharField(
        max_length=20,  # CharField requires max_length
        choices=KeytermType.choices,
        default=KeytermType.ANALYSIS,
    )
    expected_answer = models.TextField(null=True, blank=True)
    is_active = models.BooleanField(default=True)
```

### Contract
```python
class Contract(models.Model):
    class AnalysisStatus(models.TextChoices):
        PENDING = "PENDING", "Pending"
        ACCEPTED = "ACCEPTED", "Accepted"
        REJECTED = "REJECTED", "Rejected"

    class ContractType(models.TextChoices):
        MSA = "MSA", "MSA"
        NDA = "NDA", "NDA"
        # .... many more

    document = models.OneToOneField(
        "Document",
        on_delete=models.CASCADE,
        related_name="contract",
    )
    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
        related_name="contracts",
    )
    # Checklist the contract is analyzed against; set by
    # ContractService.create_contract (see Dependency Injection).
    checklist = models.ForeignKey(
        "Checklist",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="contracts",
    )
    contract_type = models.CharField(max_length=20, choices=ContractType.choices)

    analysis_status = models.CharField(
        max_length=20,
        choices=AnalysisStatus.choices,
        default=AnalysisStatus.PENDING,
    )
    analyzed_at = models.DateTimeField(null=True, blank=True)
    analyzed_by = models.ForeignKey(
        "User",
        on_delete=models.SET_NULL,
        null=True,
        related_name="analyzed_contracts",
    )

    def get_version_history(self):
        """Return every version of the contract's underlying document."""
        return Document.get_version_history(self.document.original_id)
```

### **Analysis MODEL**
- The analysis is large, unstructured textual data.
- When fetching, we might also need multiple rows of data for the same contract.
- Hence it makes sense to store this data in MongoDB document storage.
  - Easy to store unstructured data because it is just JSON
  - Easy to query the analysis for any contract

### Contract specific data
**Problem**: COI has tenant_name, property_name, etc. MSA has parties, term_length. Different contracts have different fields.

**Options**:
- One table with all fields nullable, but it will be sparse and messy
- Separate table per type (COI, MSA) -> the number of tables grows with every new contract type
- JSON field -> flexible, but no validation, and it gets messy after a short amount of time

**SOLUTION**: keep the shared fields in `Contract` and put the type-specific fields in a one-to-one detail table per contract type.
```python
class Contract(models.Model):
    document = models.OneToOneField("Document", ...)
    organization = models.ForeignKey("Organization", ...)
    contract_type = models.CharField(max_length=20, choices=ContractType.choices)
    analysis_status = models.CharField(...)
    ...


class COIContract(models.Model):
    contract = models.OneToOneField(
        Contract, on_delete=models.CASCADE, related_name="coi_details"
    )
    tenant_name = models.CharField(max_length=500)
    property_name = models.CharField(max_length=500)
    property_unit = models.CharField(max_length=500)
    expiry_date = models.DateField()
    # ... more COI-specific fields


class MSAContract(models.Model):
    contract = models.OneToOneField(
        Contract, on_delete=models.CASCADE, related_name="msa_details"
    )
    party_a = models.CharField(max_length=500)
    party_b = models.CharField(max_length=500)
    term_months = models.IntegerField(null=True)
    auto_renewal = models.BooleanField(default=False)
```

### **Strategy Pattern** for type specific logic
```python
class IContractHandler(Protocol):
    def create_metadata(self, contract: Contract, metadata: dict) -> Any: ...
    def get_metadata(self, contract: Contract) -> dict: ...


class COIHandler:
    def create_metadata(self, contract: Contract, metadata: dict) -> COIContract:
        """Persist the COI-specific fields for `contract`."""
        return COIContract.objects.create(
            contract=contract,
            tenant_name=metadata.get("tenant_name", ""),
            property_name=metadata.get("property_name", ""),
            property_unit=metadata.get("property_unit", ""),
            expiry_date=metadata.get("expiry_date"),
        )

    def get_metadata(self, contract: Contract) -> dict:
        """Return the COI-specific fields stored for `contract`."""
        details = contract.coi_details
        return {
            "tenant_name": details.tenant_name,
            "property_name": details.property_name,
            "property_unit": details.property_unit,
            "expiry_date": details.expiry_date,
        }

    # ... similar methods
```

```python
class ContractService:
    def __init__(self, organization, document_service, handlers: dict):
        self.organization = organization
        self.document_service = document_service
        # Maps a contract type to its handler, e.g. {"COI": COIHandler()}.
        self.handlers = handlers

    @transaction.atomic
    def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract:
        document = self.document_service.upload(
            file=file,
            document_type=Document.DocumentType.CONTRACT,
        )
        contract_type = self._detect_contract_type(document)

        contract = Contract.objects.create(
            document=document,
            organization=self.organization,
            contract_type=contract_type,
        )

        # Strategy pattern: delegate the type-specific part to the handler.
        handler = self.handlers.get(contract_type)
        if handler is None:
            # Raising inside the atomic block rolls the contract row back.
            raise ValueError(f"No handler registered for contract type {contract_type!r}")
        handler.create_metadata(contract, metadata)

        return contract
```

**Benefits**:
- `Contract` table stays clean
- type-specific data is normalized
- no changes to existing code when adding new types: a new contract type = a new detail table + a new handler
- lastly, handlers are easy to test in isolation

---

### Dependency Injection
```python
class IDocumentService(Protocol):
    def upload(self, file: BinaryIO, document_type: str) -> Document: ...
    def upload_new_version(self, document_id: str, file: BinaryIO) -> Document: ...


class IChecklistService(Protocol):
    def get_active_checklist(
        self, organization_id: str, contract_type: str, team_id: str = None
    ) -> Checklist: ...


class IContractService(Protocol):
    def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract: ...
    def replace_document(self, contract_id: str, file: BinaryIO) -> Contract: ...
```

```python
class DocumentService:
    def __init__(self, organization: Organization, user: User):
        self.organization = organization
        self.user = user

    def upload(self, file: BinaryIO, document_type: str) -> Document:
        pass

    def upload_new_version(self, document_id: str, file: BinaryIO) -> Document:
        pass


class ContractService:
    def __init__(
        self,
        organization: Organization,
        user: User,
        document_service: IDocumentService,
        checklist_service: IChecklistService,
    ):
        self.organization = organization
        self.user = user
        self.document_service = document_service
        self.checklist_service = checklist_service

    @transaction.atomic
    def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract:
        document = self.document_service.upload(
            file=file,
            document_type=Document.DocumentType.CONTRACT,
        )
        contract_type = self._detect_contract_type(document)

        checklist = self.checklist_service.get_active_checklist(
            organization_id=str(self.organization.id),
            contract_type=contract_type,
            team_id=team_id,
        )

        # NOTE(review): **metadata is forwarded to the Contract row, so it may
        # only contain Contract fields here; type-specific values belong to
        # the handler shown in the Strategy Pattern section -- confirm.
        return Contract.objects.create(
            document=document,
            organization=self.organization,
            checklist=checklist,
            contract_type=contract_type,
            **metadata,
        )
```

```python
class ServiceFactory:
    """Builds the services for one organization/user pair.

    The document and checklist services are created lazily and reused.
    """

    def __init__(self, organization: Organization, user: User):
        self.organization = organization
        self.user = user
        self._document_service = None
        self._checklist_service = None

    @property
    def document_service(self) -> DocumentService:
        if self._document_service is None:
            self._document_service = DocumentService(self.organization, self.user)
        return self._document_service

    @property
    def checklist_service(self) -> ChecklistService:
        if self._checklist_service is None:
            self._checklist_service = ChecklistService(self.organization)
        return self._checklist_service

    def contract_service(self) -> ContractService:
        return ContractService(
            organization=self.organization,
            user=self.user,
            document_service=self.document_service,
            checklist_service=self.checklist_service,
        )
```

---

### Usage in views (or anywhere else)
```python
@api_view(["POST"])
@permission_classes([IsAuthenticated])
def upload_contract(request):
    user = request.user
    organization = user.organization

    # Reject a request without an upload instead of failing on a missing key.
    uploaded_file = request.FILES.get("file")
    if uploaded_file is None:
        return Response({"detail": "No file provided."}, status=400)

    factory = ServiceFactory(organization, user)
    contract_service = factory.contract_service()

    contract = contract_service.create_contract(
        file=uploaded_file,
    )

    return Response({"contract_id": str(contract.id)})
```