Source code for database.models

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri May 21 18:09:30 2021

@author: Paolo Cozzi <paolo.cozzi@ibba.cnr.it>
"""

import logging

from enum import Enum

from .db import db, DB_ALIAS

# Get an instance of a logger
logger = logging.getLogger(__name__)


def complement(genotype: str) -> str:
    """Return the base-by-base complement of a genotype string.

    Every character is replaced through a fixed mapping (``A`` <-> ``T``,
    ``C`` <-> ``G``) while the ``/`` separator is kept as it is. The order
    of the characters is preserved: the string is *not* reversed.

    Args:
        genotype: a string made only of the upper case characters ``A``,
            ``T``, ``C``, ``G`` and ``/`` (for example ``"A/G"``).

    Returns:
        The complemented string (for example ``"T/C"``). An empty input
        gives an empty string.

    Raises:
        KeyError: if ``genotype`` contains any other character.
    """
    bases = {"A": "T", "T": "A", "C": "G", "G": "C", "/": "/"}

    # build the result in one pass instead of growing a string with +=
    return "".join(bases[base] for base in genotype)
class SmarterDBException(Exception):
    """Error raised by the database models of this module."""
class SmarterInfo(db.Document):
    """Track information on the database status.

    Documents are stored in the ``smarterInfo`` collection.
    """

    # primary key of the document
    id = db.StringField(primary_key=True)

    # mandatory version string
    version = db.StringField(required=True)

    # free-form dictionaries (content is not constrained by this model)
    working_assemblies = db.DictField()
    plink_specie_opt = db.DictField()

    last_updated = db.DateTimeField()

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'smarterInfo'
    }

    def __str__(self):
        return f"{self.id}: {self.version} ({self.last_updated})"
class Country(db.Document):
    """Helper document describing a country.

    Each record is created after data import, when the database status is
    updated. Documents are stored in the ``countries`` collection.
    """

    # country codes: unique, with exactly 2 and 3 characters respectively
    # (presumably ISO 3166 alpha-2 / alpha-3 codes - confirm with importer)
    alpha_2 = db.StringField(
        required=True, unique=True, min_length=2, max_length=2)
    alpha_3 = db.StringField(
        required=True, unique=True, min_length=3, max_length=3)

    # unique country name and unique numeric code
    name = db.StringField(required=True, unique=True)
    numeric = db.IntField(required=True, unique=True)

    official_name = db.StringField()

    # list of species names attached to this country
    species = db.ListField(db.StringField())

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'countries'
    }

    def __str__(self):
        return f"{self.name} ({self.alpha_2})"
class SupportedChip(db.Document):
    """Describe a chip known to the database.

    Documents are stored in the ``supportedChips`` collection.
    """

    # chip name: mandatory and unique
    name = db.StringField(required=True, unique=True)

    species = db.StringField(required=True)
    manufacturer = db.StringField()

    # number of SNPs, 0 when not specified
    n_of_snps = db.IntField(default=0)

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'supportedChips'
    }

    def __str__(self):
        return f"'{self.name}' ({self.species})"
class BreedAlias(db.EmbeddedDocument):
    """Embedded document listed in the ``aliases`` field of a ``Breed``."""

    fid = db.StringField(required=True)

    # reference to a Dataset document, stored as "dataset_id"
    dataset = db.ReferenceField(
        'Dataset',
        db_field="dataset_id")

    country = db.StringField()

    def __str__(self):
        return f"{self.fid}: {self.dataset}"
class Breed(db.Document):
    """Describe a breed of a species.

    Documents are stored in the ``breeds`` collection. Two unique indexes
    are declared, on (species, code) and on (species, name), both with the
    ``{'locale': 'en', 'strength': 1}`` collation.
    """

    species = db.StringField(required=True)
    name = db.StringField(required=True)
    code = db.StringField(required=True)

    # embedded BreedAlias documents
    aliases = db.ListField(
        db.EmbeddedDocumentField(BreedAlias))

    n_individuals = db.IntField()

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'breeds',
        'indexes': [
            {
                'fields': [
                    "species",
                    "code"
                ],
                'unique': True,
                'collation': {'locale': 'en', 'strength': 1}
            },
            {
                'fields': [
                    "species",
                    "name"
                ],
                'unique': True,
                'collation': {'locale': 'en', 'strength': 1}
            }
        ]
    }

    def __str__(self):
        return f"{self.name} ({self.code}) {self.species}"
class Dataset(db.Document):
    """Describe a dataset instance with fields owned by data types.

    Documents are stored in the ``dataset`` collection.
    """

    # dataset file: mandatory and unique
    file = db.StringField(required=True, unique=True)

    uploader = db.StringField()

    # stored as "size" in the database
    size_ = db.StringField(db_field="size")

    partner = db.StringField()

    # HINT: should country, species and breeds be a list of items?
    country = db.StringField()
    species = db.StringField()
    breed = db.StringField()

    n_of_individuals = db.IntField()
    n_of_records = db.IntField()
    trait = db.StringField()
    gene_array = db.StringField()

    # add type tag (stored as "type" in the database)
    type_ = db.ListField(db.StringField(), db_field="type")

    # file contents
    contents = db.ListField(db.StringField())

    # track the original chip_name with dataset
    chip_name = db.StringField()

    doi = db.URLField()
    """The publication DOI of this dataset"""

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'dataset'
    }

    def __str__(self):
        return f"file={self.file}, uploader={self.uploader}"
class SEX(bytes, Enum):
    """Enumeration of sexes, where each member is also a single byte.

    Members are declared as ``(value, label)`` tuples: ``value`` is used
    both as the enum value and as the only byte of the member, while
    ``label`` is the text returned by ``str()``.
    """

    UNKNOWN = (0, "Unknown")
    MALE = (1, "Male")
    FEMALE = (2, "Female")

    def __new__(cls, value, label):
        # a one-item list creates a bytes object of length 1 holding value
        member = bytes.__new__(cls, [value])
        member._value_ = value
        member.label = label
        return member

    def __str__(self):
        return self.label
class Phenotype(db.DynamicEmbeddedDocument):
    """Model the phenotype of a sample.

    This is a dynamic document rather than a generic DictField because
    some attributes can be enforced to have certain values, while every
    other attribute can be set without any assumption.
    """

    # attributes with an enforced type
    purpose = db.StringField()
    chest_girth = db.FloatField()
    height = db.FloatField()
    length = db.FloatField()

    def __str__(self):
        return f"{self.to_json()}"
class SAMPLETYPE(Enum):
    """Allowed values for the type tag of a sample."""

    FOREGROUND = 'foreground'
    BACKGROUND = 'background'
class SampleSpecies(db.Document):
    """Abstract base document with the fields shared by every sample.

    The model is declared abstract: concrete subclasses provide the
    database alias, the collection and the ``species_class`` value.
    """

    original_id = db.StringField(required=True)
    smarter_id = db.StringField(required=True, unique=True)

    country = db.StringField(required=True)

    # generic species type (required to derive other stuff)
    species_class = None

    breed = db.StringField(required=True)
    breed_code = db.StringField(min_length=2)

    # this will be a original_id alias (a different sample name in original
    # data file)
    alias = db.StringField()

    # required to search a sample relying only on original ID
    dataset = db.ReferenceField(
        Dataset,
        db_field="dataset_id",
        reverse_delete_rule=db.DENY
    )

    # add type tag (stored as "type" in the database)
    type_ = db.EnumField(SAMPLETYPE, db_field="type", required=True)

    # track the original chip_name with sample
    chip_name = db.StringField()

    # define enum types for sex
    sex = db.EnumField(SEX)

    # GPS location
    # NOTE: X, Y where X is longitude, Y latitude
    locations = db.MultiPointField(
        auto_index=True, default=None)

    # additional (not modelled) metadata
    metadata = db.DictField(default=None)

    # for phenotypes
    phenotype = db.EmbeddedDocumentField(Phenotype, default=None)

    meta = {
        'abstract': True,
        'indexes': [
            [("locations", "2dsphere")]
        ]
    }

    def __str__(self):
        return f"{self.smarter_id} ({self.breed})"
class SampleSheep(SampleSpecies):
    """Sheep sample, stored in the ``sampleSheep`` collection."""

    species = db.StringField(required=True, default="Ovis aries")

    # generic species type (required to derive other stuff)
    species_class = "Sheep"

    # try to model relationship between samples: both parents are lazy
    # references to other SampleSheep documents
    father_id = db.LazyReferenceField(
        'SampleSheep',
        passthrough=True,
        reverse_delete_rule=db.NULLIFY
    )
    mother_id = db.LazyReferenceField(
        'SampleSheep',
        passthrough=True,
        reverse_delete_rule=db.NULLIFY
    )

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'sampleSheep'
    }
class SampleGoat(SampleSpecies):
    """Goat sample, stored in the ``sampleGoat`` collection."""

    species = db.StringField(required=True, default="Capra hircus")

    # generic species type (required to derive other stuff)
    species_class = "Goat"

    # try to model relationship between samples: both parents are lazy
    # references to other SampleGoat documents
    father_id = db.LazyReferenceField(
        'SampleGoat',
        passthrough=True,
        reverse_delete_rule=db.NULLIFY
    )
    mother_id = db.LazyReferenceField(
        'SampleGoat',
        passthrough=True,
        reverse_delete_rule=db.NULLIFY
    )

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'sampleGoat'
    }
class Consequence(db.EmbeddedDocument):
    """Embedded document with no declared fields.

    It is the item type of the ``consequences`` list of a ``Location``.
    """
class Location(db.EmbeddedDocument):
    """Position of a variant as embedded document.

    The genotype is stored in the ``illumina`` field; ``illumina_top`` is
    a property computed from ``illumina`` and ``illumina_strand``, and can
    also be given as a keyword argument when creating the object.
    """

    ss_id = db.StringField()
    version = db.StringField(required=True)
    chrom = db.StringField(required=True)
    position = db.IntField(required=True)
    alleles = db.StringField()
    illumina = db.StringField(required=True)
    illumina_forward = db.StringField()
    illumina_strand = db.StringField()
    affymetrix_ab = db.StringField()
    strand = db.StringField()
    imported_from = db.StringField(required=True)

    # this could be the manifactured date or the last updated
    date = db.DateTimeField()

    consequences = db.ListField(
        db.EmbeddedDocumentField(Consequence), default=None)

    def __init__(self, *args, **kwargs):
        # illumina_top is a property and not a field: take it out of the
        # keyword arguments before initializing the base object
        illumina_top = kwargs.pop('illumina_top', None)

        super().__init__(*args, **kwargs)

        # go through the property setter, only when a (truthy) value
        # was provided
        if illumina_top:
            self.illumina_top = illumina_top

    @property
    def illumina_top(self):
        """Return genotype in illumina top format"""

        strand = self.illumina_strand

        # bottom strand: the stored genotype needs to be complemented
        if strand in ('BOT', 'bottom'):
            return complement(self.illumina)

        # no strand or top strand: the stored genotype is already top
        if not strand or strand in ('TOP', 'top'):
            return self.illumina

        raise SmarterDBException(
            f"{self.illumina_strand} not managed")

    @illumina_top.setter
    def illumina_top(self, genotype: str):
        strand = self.illumina_strand

        if strand in ('BOT', 'bottom'):
            # store the complement, so that the getter gives back genotype
            self.illumina = complement(genotype)

        elif not strand or strand in ('TOP', 'top'):
            self.illumina = genotype

        else:
            raise SmarterDBException(
                f"{self.illumina_strand} not managed")

    def __str__(self):
        return (
            f"({self.imported_from}:{self.version}) "
            f"{self.chrom}:{self.position} [{self.illumina_top}]"
        )
class Probeset(db.EmbeddedDocument):
    """Embedded document listing the probeset ids of a chip."""

    chip_name = db.StringField(required=True)

    # more probe could be assigned to the same SNP
    probeset_id = db.ListField(db.StringField())

    def __str__(self):
        return (
            f"{self.chip_name}: {self.probeset_id}"
        )
class VariantSpecies(db.Document):
    """Abstract base document with the fields shared by every variant.

    The model is declared abstract: concrete subclasses provide the
    database alias and the collection.
    """

    rs_id = db.ListField(db.StringField(), default=None)
    chip_name = db.ListField(db.StringField())

    name = db.StringField(unique=True)

    # sequence should model both illumina or affymetrix sequences
    sequence = db.DictField()

    # illumina top variant at variant level
    illumina_top = db.StringField(required=True)

    locations = db.ListField(
        db.EmbeddedDocumentField(Location))

    # HINT: should sender be a Location attribute?
    sender = db.StringField()

    # Affymetryx specific fields
    probesets = db.ListField(
        db.EmbeddedDocumentField(Probeset), default=None)
    affy_snp_id = db.StringField()
    cust_id = db.StringField()

    # abstract class with custom indexes
    # TODO: need a index for position (chrom, position, version)
    meta = {
        'abstract': True,
        'indexes': [
            {
                'fields': [
                    "locations.chrom",
                    "locations.position"
                ],
            },
            {
                'fields': ["affy_snp_id"],
                'partialFilterExpression': {
                    "affy_snp_id": {
                        "$exists": True
                    }
                }
            },
            "probesets.probeset_id",
            'rs_id',
        ]
    }

    def __str__(self):
        # variants without a name are described through their affy_snp_id
        if not self.name and self.affy_snp_id:
            return (
                f"affy_snp_id='{self.affy_snp_id}', rs_id='{self.rs_id}', "
                f"illumina_top='{self.illumina_top}'")

        return (
            f"name='{self.name}', rs_id='{self.rs_id}', "
            f"illumina_top='{self.illumina_top}'")

    def to_mongo(self, *args, **kwargs):
        """Override flask-mongoengine method

        The data returned by the base method is extended by adding the
        ``illumina_top`` property to each location, since a property is
        not serialized like a field.

        Args:
            *args: forwarded unchanged to the base ``to_mongo``
            **kwargs: forwarded unchanged to the base ``to_mongo``

        Returns:
            The data returned by the base method, where every item of
            ``locations`` (if any) has an additional ``illumina_top`` key

        Raises:
            SmarterDBException: if a location has an illumina strand which
                is not managed
        """
        data = super().to_mongo(*args, **kwargs)

        # 'locations' could be missing from data (for example when the
        # output is restricted to other fields): nothing to add then
        serialized = data.get('locations') or []

        # add illumina_top property to locations
        for item, location in zip(serialized, self.locations):
            item['illumina_top'] = location.illumina_top

        return data
class VariantSheep(VariantSpecies):
    """Sheep variant, stored in the ``variantSheep`` collection."""

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'variantSheep'
    }
class VariantGoat(VariantSpecies):
    """Goat variant, stored in the ``variantGoat`` collection."""

    meta = {
        'db_alias': DB_ALIAS,
        'collection': 'variantGoat'
    }