Files
spidy-look-at-it/01-schema-refactoring.md
Kulvir Singh b92b43c4bc some updated
2025-12-26 22:39:54 +05:30

13 KiB

Problems

Data Split

  • MasterSettingsModel stores: key, question, instructions, category etc....
  • ChecklistItem stores: answer (expected value)
  • To get a complete "checklist item", need to JOIN both tables.
  • Also current MasterSettings store same keyterm for multiple checklists/playbooks.
    • meaning if a keyterm is changed but for one checklists it will change for others too
    • to prevent such scenarios, funny strategies need to be implemented in code
    • conclusion unnecessary complexity is introduced.

complex queries for simple things

# Current: Get checklist with all items (3 queries + manual merge)
checklist = ChecklistMetadata.objects.get(checklist_id=checklist_id)
checklist_items = ChecklistItem.objects.filter(checklist_id=checklist_id)
keyterm_ids = [item["keyterm_id_id"] for item in checklist_items]
keyterms = MasterSettingsModel.objects.filter(RowKey__in=keyterm_ids)
# Then manually merge...

is_active for checklists

  • can be null/none - no active checklist found
  • race conditions when switching active checklist
  • scope - active for which team?? which contract type???
  • can delete active checklist accidentally

Goals

  • Reduce Joins and Complexity
  • Versioning: Full history for keyterms, checklists, and contracts

Document (File Storage)

class Document():
    class DocumentType(models.TextChoices):
        CONTRACT = "CONTRACT", "Contract"
        CHECKLIST = "CHECKLIST", "Checklist"
        PLAYBOOK = "PLAYBOOK", "Playbook"
    
    class FileType(models.TextChoices):
        DOC = "DOC", "Word Document (Legacy)"
        PDF = "PDF", "PDF"
        DOCX = "DOCX", "Word Document"
        CSV = "CSV", "CSV"
        XLSX = "XLSX", "Excel"
    
    id = models.UUIDField(db_index=True)
     
    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
        related_name="documents"
    )
    user = models.ForeignKey(
        "User",
        on_delete=models.SET_NULL,
    )

    document_type = models.CharField(
        max_length=20,
        choices=DocumentType.choices
    ) name = models.CharField(max_length=500) blob_url = models.URLField(max_length=2000) file_type = models.CharField(max_length=20, choices=FileType.choices)
    size_bytes = models.BigIntegerField()
    total_pages = models.IntegerField(null=True, blank=True)
    
    # versioning ezzz
    version = models.IntegerField(default=1)
    previous_version = models.ForeignKey(
        "self",
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="next_versions"
    )
    is_current = models.BooleanField(default=True)
    
    def create_new_version(
        self,
        blob_url: str,
        uploaded_by,
        **file_metadata
    ) -> "Document":
        self.is_current = False
        self.save(update_fields=["is_current", "updated_at"])
        
        return Document.objects.create(
            id=self.id,
            previous_version=self,
            version=self.version + 1,
            is_current=True,
            organization=self.organization,
            uploaded_by=uploaded_by,
            document_type=self.document_type,
            name=file_metadata.get("name", self.name),
            blob_url=blob_url,
            file_type=file_metadata.get("file_type", self.file_type),
            size_bytes=file_metadata.get("size_bytes", self.size_bytes),
            total_pages=file_metadata.get("total_pages"),
        )
    
    @classmethod
    def get_current(cls, id: str) -> "Document":
        return cls.objects.get(original_id=original_id, is_current=True)
    
    @classmethod
    def get_version_history(cls, original_id: str):
        return cls.objects.filter(original_id=original_id).order_by("version")
    
    @property
    def size_kb(self) -> float:
        return self.size_bytes / 1024

Checklist

class Checklist():
    id = models.UUIDField()

    name = models.CharField(max_length=200)
    description = models.TextField(blank=True)

    document = models.ForeignKey("Document", on_delete=models.SET_NULL)

    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
    )

    created_by = models.ForeignKey(
        "User",
        on_delete=models.SET_NULL,
        null=True,
    )
    

Keyterm

class Keyterm():
    class KeytermType(models.TextChoices):
        ANALYSIS = "ANALYSIS", "Analysis"
        CHECKLIST = "CHECKLIST", "Checklist"
        METADATA = "METADATA", "Metadata"
    
    id = models.UUIDField(db_index=True)

    key = models.CharField(max_length=500)
    question = models.TextField()
    instructions = models.TextField(blank=True)
    
    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
    )
     
    type = models.CharField(
        choices=KeytermType.choices,
        default=KeytermType.ANALYSIS
    )

    expected_answer = models.TextField(null=True, blank=True)

Contract

class Contract():
    class AnalysisStatus(models.TextChoices):
        PENDING = "PENDING", "Pending"
        ACCEPTED = "ACCEPTED", "Accepted"
        REJECTED = "REJECTED", "Rejected"

    class ContractType(models.TextChoices):
        MSA = "MSA", "msa"
        NDA = "NDA", "NDA"
        # .... many more
    
    document = models.OneToOneField(
        "Document",
        on_delete=models.CASCADE,
        related_name="contract"
    )
    
    organization = models.ForeignKey(
        "Organization",
        on_delete=models.CASCADE,
        related_name="contracts"
    )

    contract_type = models.CharField(choices=ContractType)
    
    analysis_status = models.CharField(
        choices=AnalysisStatus.choices,
        default=AnalysisStatus.PENDING
    )
    
    analyzed_at = models.DateTimeField(null=True, blank=True)
    analyzed_by = models.ForeignKey(
        "User",
        on_delete=models.SET_NULL,
        null=True,
        related_name="analyzed_contracts"
    )

Analysis MODEL

  • large unstructured textual data
  • Also while fetching might need to fetch multiple rows of data for same contract
  • Hence it makes sense to store this data in the Mongo DB document storage.
    • Easy to store unstructured data because its just JSON
    • Easy to query analysis for any contract

Contract specific data

Problem: COI has tenant_name, property_name etc. MSA has parties, term_length. different contracts have different fields.

Options:

  • One table with all nullable fields, but it will be sparse and kinda messy
  • Separate table per type (COI, MSA) -> lmaoo more schema tables than users
  • JSON field -> flexible but no validations + gets messy after short amount of time

SOLUTION:

class Contract():
    document = models.OneToOneField("Document", ...)
    organization = models.ForeignKey("Organization", ...)
    contract_type = models.CharField(choices=ContractType)
    analysis_status = models.CharField(...)
    ...


class COIContract():
    contract = models.OneToOneField(Contract, on_delete=models.CASCADE, related_name="coi_details")
    tenant_name = models.CharField()
    property_name = models.CharField()
    property_unit = models.CharField()
    expiry_date = models.DateField()
    # ..... ezzzzz


class MSAContract():
    contract = models.OneToOneField(Contract, on_delete=models.CASCADE, related_name="msa_details")
    party_a = models.CharField(max_length=500)
    party_b = models.CharField(max_length=500)
    term_months = models.IntegerField(null=True)
    auto_renewal = models.BooleanField(default=False)

Benefits:

  • Contract table stays clean
  • type specific data is normalized
  • no changes to existing code when adding new types, adding new contract type = new detail table + new handler
  • lastly, easy to test handlers in isolation

Concern: Separate tables per contract type could lead to "schema explosion" BUT....

  • Current system has 10-15 distinct contract categories
  • Many categories share similar metadata structures
  • Most contract types might only need generic metadata storage

Strategy Pattern for structuring code.

class IContractHandler():
    def create_metadata(self, contract: Contract, metadata: dict) -> Any: ...
    def get_metadata(self, contract: Contract) -> dict: ...

class COIHandler:
    def create_metadata(self, contract: Contract, metadata: dict) -> COIDetails:
        return COIDetails.objects.create(
            contract=contract,
            tenant_name=metadata.get("tenant_name", ""),
            property_name=metadata.get("property_name", ""),
            property_unit=metadata.get("property_unit", ""),
            expiry_date=metadata.get("expiry_date"),
        )
    
    def get_metadata(self, contract: Contract) -> dict:
        details = contract.coi_details
        return {
            "tenant_name": details.tenant_name,
            "property_name": details.property_name,
            "expiry_date": details.expiry_date,
        }

    # ... similar methods
class ContractService:
    def __init__(self, ...):
    
    @transaction.atomic
    def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract:
        document = self.document_service.upload(file=file, ...)
        contract_type = self._detect_contract_type(document)
        
        contract = Contract.objects.create(
            document=document,
            organization=self.organization,
            contract_type=contract_type,
        )
        
        # implementation of strategy pattern in actual methods
        handler = self.handler.get(contract_type)
        handler.create_details(contract, metadata)
        
        return contract

Dependency Injection

class IDocumentService(Protocol):
    def upload(self, file: BinaryIO, document_type: str) -> Document: ...
    def upload_new_version(self, document_id: str, file: BinaryIO) -> Document: ...

class IChecklistService(Protocol):
    def get_active_checklist(self, organization_id: str, contract_type: str, team_id: str = None) -> Checklist: ...

class IContractService(Protocol):
    def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract: ...
    def replace_document(self, contract_id: str, file: BinaryIO) -> Contract: ...
class DocumentService:
    def __init__(self, organization: Organization, user: User):
        self.organization = organization
        self.user = user
    
    def upload(self, file: BinaryIO, document_type: str) -> Document:
        pass
    
    def upload_new_version(self, document_id: str, file: BinaryIO) -> Document:
        pass

class ContractService:
    def __init__(
        self,
        organization: Organization,
        user: User,
        document_service: IDocumentService,
        checklist_service: IChecklistService
    ):
        self.organization = organization
        self.user = user
        self.document_service = document_service
        self.checklist_service = checklist_service
    
    @transaction.atomic
    def create_contract(self, file: BinaryIO, team_id: str = None, **metadata) -> Contract:
        document = self.document_service.upload(
            file=file,
            document_type=Document.DocumentType.CONTRACT,
        )
        
        contract_type = self._detect_contract_type(document)
        
        checklist = self.checklist_service.get_active_checklist(
            organization_id=str(self.organization.id),
            contract_type=contract_type,
            team_id=team_id
        )
        
        return Contract.objects.create(
            document=document,
            organization=self.organization,
            checklist=checklist,
            contract_type=contract_type,
            **metadata
        )  
class ServiceFactory:
    def __init__(self, organization: Organization, user: User):
        self.organization = organization
        self.user = user
        self._document_service = None
        self._checklist_service = None
    
    @property
    def document_service(self) -> DocumentService:
        if not self._document_service:
            self._document_service = DocumentService(self.organization, self.user)
        return self._document_service
    
    @property
    def checklist_service(self) -> ChecklistService:
        if not self._checklist_service:
            self._checklist_service = ChecklistService(self.organization)
        return self._checklist_service
    
    def contract_service(self) -> ContractService:
        return ContractService(
            organization=self.organization,
            user=self.user,
            document_service=self.document_service,
            checklist_service=self.checklist_service
        )

usage in views or anything

@api_view(["POST"])
@permission_classes([IsAuthenticated])
def upload_contract(request):
    user = request.user
    organization = user.organization
    
    factory = ServiceFactory(organization, user)
    contract_service = factory.contract_service()
    
    contract = contract_service.create_contract(
        file=request.FILES["file"],
    )
    
    return Response({"contract_id": str(contract.id)})