Selection plus FunPDBe Study Case of Q13426

cover

pip install --upgrade pdb-profiling

Import Packages & Settings

from pdb_profiling import default_config
default_config("C:/GitWorks/pdb-profiling/test/demo")

from pdb_profiling.processors import SIFTS, PDB, Base
from pdb_profiling.utils import DisplayPDB

from tqdm.notebook import tqdm
from pandas import concat, DataFrame
# Chain-level filtering
# Set the SIFTS.chain_filter condition (e.g. UNK_COUNT < SEQRES_COUNT); the default value is shown below.
# NOTE(review): the expression presumably refers to the columns listed under "valid filters" below -- confirm against the pdb-profiling documentation.
SIFTS.chain_filter = '''
    UNK_COUNT < SEQRES_COUNT
    and ca_p_only == False
    and identity >=0.9
    and repeated == False
    and reversed == False
    and OBS_COUNT > 20'''

valid filters:

Column NameTypeExplanation
identityfloatprovided by SIFTS: sequence identity of PDB Entity SEQRES with UniProt Isoform (0-1)
is_canonicalboolwhether the UniProt Isoform is the canonical isoform defined by UniProt-KB
sifts_range_tagstrSafe or Insertion or Deletion or InDel (example)
reversedboolwhether there is reversed mapped range in the aspect of UniProt Isoform Sequence (example)
repeatedboolwhether there is repeated mapped range in the aspect of UniProt Isoform Sequence (example)
InDel_sumintSEQRES residues that fall into the range of Insertion or Deletion or InDel of the PDB Chain Instance
unp_lenintthe length of the UniProt Isoform Sequence
BINDING_LIGAND_COUNTintthe count of residues that bind to ligands (including carbohydrate polymers) of the PDB Chain Instance
OBS_COUNTintthe observed/modelled (with coordinates) residues of the PDB Chain Instance
OBS_RATIO_SUMfloatthe sum of the ratios of the observed/modelled (with coordinates) residues of the PDB Chain Instance
NON_COUNTintthe count of non-standard residues of the PDB Entity (including UNK)
SEQRES_COUNTintthe count of the residues in SEQRES
STD_COUNTintthe count of the standard residues of the PDB Entity
UNK_COUNTintthe count of the UNK residues of the PDB Entity
ca_p_onlyboolwhether the PDB Entity only contains C-alpha atom for each residue
OBS_STD_COUNTintthe count of the observed standard residues of the PDB Chain Instance
# PDB entry-level filtering
# Set the SIFTS.entry_filter condition; the default value is shown below.
# NOTE(review): the expression presumably refers to the columns listed under "valid filters" below -- confirm against the pdb-profiling documentation.
SIFTS.entry_filter = '''
    (experimental_method in ["X-ray diffraction", "Electron Microscopy"] and resolution <= 3) or 
    experimental_method == "Solution NMR"
    '''

valid filters:

Column NameTypeExplanation
resolutionfloat/nan(pdb-101-explanation)
experimental_method_classstrx-ray, nmr, em, other
experimental_methodstr(pdb-101-explanation)
multi_methodboolwhether the PDB entry was determined by multiple methods
revision_datedateas the name says
deposition_datedateas the name says
demo = SIFTS('Q13426')

Select Monomeric Protein

Implement PDBe RESTful API (PDBe Entry & SIFTS)

%time df1 = demo.pipe_select_mo().result()
df1[df1.select_tag.eq(True)].T
Wall time: 654 ms

4
UniProtQ13426
chain_idA
entity_id1
identity0.99
is_canonicalTrue
pdb_id3ii6
struct_asym_idA
pdb_range[[1,203]]
unp_range[[1,203]]
EntryQ13426
range_diff[0]
sifts_range_tagSafe
repeatedFalse
reversedFalse
InDel_sum0
new_pdb_range[[1,203]]
new_unp_range[[1,203]]
conflict_pdb_index{"60":"A","134":"I"}
conflict_pdb_range[[60,60],[134,134]]
conflict_unp_range[[60,60],[134,134]]
unp_len336
BINDING_LIGAND_COUNT0
BINDING_LIGAND_INDEX[]
OBS_COUNT201
OBS_INDEX[[1, 201]]
OBS_RATIO_SUM201
ARTIFACT_INDEX[]
NON_COUNT0
NON_INDEX[]
SEQRES_COUNT203
STD_COUNT203
STD_INDEX[[1, 203]]
UNK_COUNT0
UNK_INDEX[]
ca_p_onlyFalse
molecule_typepolypeptide(L)
OBS_STD_INDEX((1, 201),)
OBS_STD_COUNT201
RAW_BS0.587555
RAW_BS_IG30.587555
resolution2.4
experimental_method_classx-ray
experimental_methodX-ray diffraction
multi_methodFalse
revision_date20110713
deposition_date20090731
1/resolution0.416667
id_score-65
select_tagTrue
select_rank1
DisplayPDB(dark=True).show('3ii6', range(1,3))
Asymmetric unit of 3ii6Biological assembly 1 of 3ii6Biological assembly 2 of 3ii6

Prepare for Residue-Level Mapping

# Take the first row of df1 whose select_tag is True.
record = df1[df1.select_tag.eq(True)].iloc[0]

# Request the expanded residue-level mapping for that row's PDB entry, passing
# its UniProt accession, its new_unp_range / new_pdb_range values and its
# struct_asym_id.
# NOTE(review): .result() presumably waits for the underlying task to finish -- confirm.
mapping_df = PDB(record['pdb_id']).get_expanded_map_res_df(
    record['UniProt'], 
    record['new_unp_range'], 
    record['new_pdb_range'], 
    struct_asym_id=record['struct_asym_id']).result()

mapping_df

unp_residue_numberresidue_numberUniProtauthor_insertion_codeauthor_residue_numberchain_identity_idmultiple_conformersobserved_ratiopdb_idresidue_namestruct_asym_id
011Q134261A1NaN13ii6META
122Q134262A1NaN13ii6GLUA
233Q134263A1NaN13ii6ARGA
344Q134264A1NaN13ii6LYSA
455Q134265A1NaN13ii6ILEA
.......................................
198199199Q13426199A1NaN13ii6LEUA
199200200Q13426200A1NaN13ii6ASNA
200201201Q13426201A1NaN13ii6ALAA
201202202Q13426202A1NaN03ii6ALAA
202203203Q13426203A1NaN03ii6GLNA

203 rows × 12 columns

Detecting Homomeric Interaction

also annotated by PISA & Interactome3D

from pdb_profiling.processors.i3d.api import Interactome3D

Interactome3D.pipe_init_interaction_meta().result()
%time df2 = demo.pipe_select_ho(run_as_completed=True, progress_bar=tqdm).result()
df2[df2.i_select_tag.eq(True)]
HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

Wall time: 2.47 s

entity_id_1chain_id_1struct_asym_id_1struct_asym_id_in_assembly_1asym_id_rank_1model_id_1molecule_type_1surface_range_1interface_range_1entity_id_2...select_rank_2in_i3dunp_range_DSCbest_select_rank_scoresecond_select_rank_scoreunp_interface_range_1unp_interface_range_2i_groupi_select_tagi_select_rank
01AAA11polypeptide(L)[[15,35],[37,47],[49,108],[110,126],[128,227]][[19,21],[30,31],[33,33],[51,54],[132,134],[13...1...4True1.00.2500000.250000((5, 7), (16, 17), (19, 19), (37, 40), (118, 1...((5, 7), (16, 17), (19, 19), (37, 40), (118, 1...(Q13426, Q13426)True10
71AAA11polypeptide(L)[[1,21],[23,201]][[11,16],[89,91],[103,103]]1...7False1.01.0000000.142857((11, 16), (89, 91), (103, 103))((1, 1), (3, 3), (25, 25), (121, 121), (124, 1...(Q13426, Q13426)True4
81BBB11polypeptide(L)[[1,19],[21,35],[37,76],[82,94],[96,201]][[117,118],[121,121],[124,124]]1...7False1.00.2000000.142857((117, 118), (121, 121), (124, 124))((117, 118), (121, 121), (124, 124))(Q13426, Q13426)True11
101AAA11polypeptide(L)[[1,21],[23,201]][[5,7],[15,17],[19,19],[37,40],[119,121],[123,...1...5True1.01.0000000.200000((5, 7), (15, 17), (19, 19), (37, 40), (119, 1...((5, 7), (16, 17), (19, 19), (38, 40), (117, 1...(Q13426, Q13426)True2
111AAA11polypeptide(L)[[1,21],[23,201]][[7,7],[9,9],[14,15],[17,17],[19,19],[80,80]]1...2False1.01.0000000.500000((7, 7), (9, 9), (14, 15), (17, 17), (19, 19),...((7, 7), (9, 9), (14, 15), (17, 17), (80, 80))(Q13426, Q13426)True1
161AAAA22polypeptide(L)[[1,35],[37,178]][[57,62],[65,65],[98,98],[101,107]]1...6True1.00.1666670.111111((57, 62), (65, 65), (98, 98), (101, 107))((1, 1), (3, 3), (23, 25), (30, 33), (46, 46),...(Q13426, Q13426)True15
211BBB11polypeptide(L)[[1,17],[19,33],[35,35],[37,41],[43,203]][[145,145],[148,149],[152,152],[155,156],[158,...1...6True1.00.1666670.166667((145, 145), (148, 149), (152, 152), (155, 156...((145, 145), (148, 149), (152, 152), (155, 156...(Q13426, Q13426)True12
221AAA11polypeptide(L)[[1,178]][[7,7],[9,9],[15,17]]1...9True1.00.1111110.111111((7, 7), (9, 9), (15, 17))((57, 57), (62, 62), (64, 64))(Q13426, Q13426)True21
231AAA11polypeptide(L)[[1,178]][[166,166],[169,170],[173,174]]1...9False1.00.1111110.111111((166, 166), (169, 170), (173, 174))((166, 166), (169, 170), (173, 174))(Q13426, Q13426)True22

9 rows × 114 columns

Detecting Heteromeric Interaction

also annotated by PISA & Interactome3D

%time df3 = demo.pipe_select_he(run_as_completed=True, progress_bar=tqdm).result()
df3[df3.i_select_tag.eq(True)]
HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))

Wall time: 2.23 s

entity_id_1chain_id_1struct_asym_id_1struct_asym_id_in_assembly_1asym_id_rank_1model_id_1molecule_type_1surface_range_1interface_range_1entity_id_2...select_tag_2select_rank_2in_i3dbest_select_rank_scoresecond_select_rank_scoreunp_interface_range_1unp_interface_range_2i_groupi_select_tagi_select_rank
101AAAA22polypeptide(L)[[15,33],[35,35],[37,47],[49,108],[110,126],[1...[[169,169],[172,173],[176,177],[179,180],[183,...2...True1True1.00.25((155, 155), (158, 159), (162, 163), (165, 166...((465, 466), (469, 470), (473, 473), (476, 477...(Q13426, Q0D2I5)True1
111AAAA22polypeptide(L)[[15,33],[35,35],[37,47],[49,108],[110,126],[1...[[169,169],[172,173],[176,177],[179,180],[183,...2...True1True1.00.25((155, 155), (158, 159), (162, 163), (165, 166...((105, 106), (109, 110), (113, 113), (116, 117...(Q13426, Q0D2I5-2)True1
121AAAA22polypeptide(L)[[15,33],[35,35],[37,47],[49,108],[110,126],[1...[[169,169],[172,173],[176,177],[179,180],[183,...2...True1True1.00.25((155, 155), (158, 159), (162, 163), (165, 166...((468, 469), (472, 473), (476, 476), (479, 480...(Q13426, Q0D2I5-4)True1
131AAAA22polypeptide(L)[[15,33],[35,35],[37,47],[49,108],[110,126],[1...[[169,169],[172,173],[176,177],[179,180],[183,...2...True1True1.00.25((155, 155), (158, 159), (162, 163), (165, 166...((469, 470), (473, 474), (477, 477), (480, 481...(Q13426, Q0D2I5-5)True1
141AAAA22polypeptide(L)[[15,33],[35,35],[37,47],[49,108],[110,126],[1...[[169,169],[172,173],[176,177],[179,180],[183,...2...True1True1.00.25((155, 155), (158, 159), (162, 163), (165, 166...((106, 107), (110, 111), (114, 114), (117, 118...(Q13426, Q0D2I5-6)True1
151AAAA22polypeptide(L)[[15,33],[35,35],[37,47],[49,108],[110,126],[1...[[169,169],[172,173],[176,177],[179,180],[183,...2...True1True1.00.25((155, 155), (158, 159), (162, 163), (165, 166...((468, 469), (472, 473), (476, 476), (479, 480...(Q13426, Q0D2I5-7)True1
411CCC11polypeptide(L)[[1,21],[23,201]][[150,150],[153,154],[157,158],[161,161],[164,...2...True1True1.00.50((150, 150), (153, 154), (157, 158), (161, 161...((763, 771), (774, 775), (778, 778), (800, 800...(Q13426, P49917)True1

7 rows × 117 columns

Collecting Residue-Level Annotation From FunPDBe via PDBe Graph API

PDBe-KB consortium, PDBe-KB: a community-driven resource for structural and functional annotations, Nucleic Acids Research, Volume 48, Issue D1, 08 January 2020, Pages D344–D353, https://doi.org/10.1093/nar/gkz853

Partner resource (Reference)Resource leaderType of annotationsNumber of PDB entries
COSPI-Depth (21)M. S. MadhusudhanResidue depth141 097
P2rank (6)D. HokszaBinding site predictions138 892
Arpeggio (15)T. BlundellLigand interactions117 023
3DComplex (14)E. D. LevyInteraction interfaces111 555
DynaMine (19)W. VrankenBackbone flexibility predictions98 548
POPSCOMP (20)F. FraternaliSolvent accessibility77 578
AKID (11)M. Helmer-CitterichKinase-target predictor41 492
ChannelsDB (9)R. SvobodovaMolecular channels25 351
CATH-FunSites (13)C. OrengoFunctional site predictions23 975
canSAR (7)B. al-LazikaniDruggable pocket predictions17 804
FoldX (17)L. SerranoEnergetic consequences of mutations3778
ProKinO (10)N. KannanCurated regulatory sites3673
14–3-3-Pred (12)G. BartonBinding site predictions1941
CaMKinet (in preparation)M. KumarCurated PTM sites1076
M-CSA (5)J. ThorntonCurated catalytic sites919
3DLigandSite (8)M. WassBinding site predictions910
Missense3D (18)M. SternbergMutations in Human Proteome0*
MetalPDB (16)A. RosatoCurated metal binding sites0*
ELM (24)T. GibsonShort linear motifs0*
pdb_ob = PDB(record['pdb_id'])
pdb_ob
<PDB 3ii6>
funpdbe_df = pdb_ob.fetch_from_pdbe_api('graph-api/pdb/funpdbe_annotation/', Base.to_dataframe).result()
funpdbe_df[funpdbe_df.chain_id.eq(record['chain_id'])]

author_insertion_codeauthor_residue_numberchain_idchem_comp_idconfidence_classificationconfidence_scoreentity_idevidence_codeslabeloriginpdb_idraw_scoreresidue_numbersite_id
01AMETNaN0.51['ECO_0000364', 'ECO_0000203']backbonedynamine3ii60.76500011
12AGLUNaN0.51['ECO_0000364', 'ECO_0000203']backbonedynamine3ii60.77300021
23AARGNaN0.51['ECO_0000364', 'ECO_0000203']backbonedynamine3ii60.78400031
34ALYSNaN0.51['ECO_0000364', 'ECO_0000203']backbonedynamine3ii60.78800041
45AILENaN0.51['ECO_0000364', 'ECO_0000203']backbonedynamine3ii60.79200051
.............................................
9282161AARGhighNaN1['ECO_0000006', 'ECO_0000088']DiseaseFoldX3ii61.1245401611
928356AALAhighNaN1['ECO_0000006', 'ECO_0000088']PolymorphismFoldX3ii62.279820562
928412ASERhighNaN1['ECO_0000006', 'ECO_0000088']PolymorphismFoldX3ii60.314961123
928543ATRPhighNaN1['ECO_0000006', 'ECO_0000088']DiseaseFoldX3ii62.777570434
9286142AGLUhighNaN1['ECO_0000006', 'ECO_0000088']PolymorphismFoldX3ii6-0.1969691425

1613 rows × 14 columns

Collecting Chain|Residue-Level Functional Annotation From SIFTS API | PDBe Graph API

Jose M Dana, Aleksandras Gutmanas, Nidhi Tyagi, Guoying Qi, Claire O’Donovan, Maria Martin, Sameer Velankar, SIFTS: updated Structure Integration with Function, Taxonomy and Sequences resource allows 40-fold increase in coverage of structure-based annotations for proteins, Nucleic Acids Research, Volume 47, Issue D1, 08 January 2019, Pages D482–D489, https://doi.org/10.1093/nar/gky1114

Structure Integration with Function, Taxonomy and Sequence (SIFTS) is a project in the PDBe-KB resource for residue-level mapping between UniProt and PDB entries. SIFTS also provides annotation from the IntEnz, GO, InterPro, Pfam, CATH, SCOP, PubMed, Ensembl and Homologene resources. The information is updated and released every week concurrently with the release of new PDB entries and is widely used by resources such as RCSB PDB, PDBj, PDBsum, Pfam, SCOP and InterPro.

  • api/mappings/ or graph-api/mappings/

    • api/mappings/sequence_domains/
      • NOTE: (interpro+pfam)
      • api/mappings/interpro/
      • api/mappings/pfam/
    • api/mappings/structural_domains/
      • NOTE: (scop+cath)
      • api/mappings/scop/
      • api/mappings/cath/
    • api/mappings/cath_b/
    • api/mappings/go/ (chain-level)
    • api/mappings/ec/ (chain-level)
    • api/mappings/hmmer/
  • api/pdb/entry/secondary_structure/

  • graph-api/pdb/sequence_conservation/

pdb_ob.fetch_from_pdbe_api('api/mappings/interpro/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

InterProchain_idendentity_ididentifiernamepdb_idstartstruct_asym_id
0IPR009089A{"author_residue_number":117,"author_insertion...1XRCC4, N-terminal domain superfamilyXRCC4, N-terminal domain superfamily3ii6{"author_residue_number":1,"author_insertion_c...A
22IPR010585A{"author_residue_number":200,"author_insertion...1DNA repair protein XRCC4DNA repair protein XRCC43ii6{"author_residue_number":1,"author_insertion_c...A
23IPR010585A{"author_residue_number":201,"author_insertion...1DNA repair protein XRCC4DNA repair protein XRCC43ii6{"author_residue_number":2,"author_insertion_c...A
pdb_ob.fetch_from_pdbe_api('api/mappings/pfam/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

Pfamchain_idcoveragedescriptionendentity_ididentifiernamepdb_idstartstruct_asym_id
pdb_ob.fetch_from_pdbe_api('api/mappings/structural_domains/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

CATHarchitecturechain_idclassdomainendentity_idhomologyidentifiernamepdb_idsegment_idstartstruct_asym_idtopology
41.20.5.370Up-down BundleAMainly Alpha3ii6A02{"author_residue_number":176,"author_insertion...1Single alpha-helices involved in coiled-coils ...Single alpha-helices involved in coiled-coils ...Dna repair protein xrcc4. Chain: a, b, c, d. F...3ii61{"author_residue_number":119,"author_insertion...ASingle alpha-helices involved in coiled-coils ...
82.170.210.10Beta ComplexAMainly Beta3ii6A01{"author_residue_number":118,"author_insertion...1DNA double-strand break repair and VJ recombin...Dna Repair Protein Xrcc4; Chain: A, domain 1Dna repair protein xrcc4. Chain: a, b, c, d. F...3ii61{"author_residue_number":1,"author_insertion_c...ADna Repair Protein Xrcc4; Chain: A, domain 1
pdb_ob.fetch_from_pdbe_api('api/mappings/cath_b/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

CATH-Barchitecturechain_idclassdomainendentity_idhomologyidentifiernamepdb_idsegment_idstartstruct_asym_idtopology
41.20.5.370Up-down BundleAMainly Alpha3ii6A02{"author_residue_number":176,"author_insertion...1Single alpha-helices involved in coiled-coils ...Single alpha-helices involved in coiled-coils ...Dna repair protein xrcc4. Chain: a, b, c, d. F...3ii61{"author_residue_number":119,"author_insertion...ASingle alpha-helices involved in coiled-coils ...
82.170.210.10Beta ComplexAMainly Beta3ii6A01{"author_residue_number":118,"author_insertion...1DNA double-strand break repair and VJ recombin...Dna Repair Protein Xrcc4; Chain: A, domain 1Dna repair protein xrcc4. Chain: a, b, c, d. F...3ii61{"author_residue_number":1,"author_insertion_c...ADna Repair Protein Xrcc4; Chain: A, domain 1
pdb_ob.fetch_from_pdbe_api('api/mappings/go/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

GOcategorychain_iddefinitionentity_ididentifiernamepdb_idstruct_asym_id
0GO:0006310Biological_processAAny process in which a new genotype is formed ...1DNA recombinationDNA recombination3ii6A
4GO:0006302Biological_processAThe repair of double-strand breaks in DNA via ...1double-strand break repairdouble-strand break repair3ii6A
10GO:0005634Cellular_componentAA membrane-bounded organelle of eukaryotic cel...1nucleusnucleus3ii6A
18GO:0003677Molecular_functionAAny molecular function by which a gene product...1DNA bindingDNA binding3ii6A
pdb_ob.fetch_from_pdbe_api('api/mappings/ec/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

ECaccepted_namechain_identity_ididentifierpdb_idreactionstruct_asym_idsynonymssystematic_name
pdb_ob.fetch_from_pdbe_api('api/mappings/hmmer/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

HMMERchain_idcoveragedescriptionendentity_idhmm_endhmm_lengthhmm_startidentifiernamepdb_idstartstruct_asym_id
6PF06632A0.608DNA double-strand break repair and V(D)J recom...{"author_residue_number":200,"author_insertion...12053371DNA double-strand break repair and V(D)J recom...XRCC43ii6{"author_residue_number":1,"author_insertion_c...A
pdb_ob.fetch_from_pdbe_api('api/pdb/entry/secondary_structure/', Base.to_dataframe).result().query('chain_id == "{}"'.format(record['chain_id']))

chain_idendentity_idpdb_idsecondary_structuresheet_idstartstruct_asym_id
0A{"author_residue_number":59,"author_insertion_...13ii6helicesNaN{"author_residue_number":49,"author_insertion_...A
1A{"author_residue_number":75,"author_insertion_...13ii6helicesNaN{"author_residue_number":62,"author_insertion_...A
2A{"author_residue_number":201,"author_insertion...13ii6helicesNaN{"author_residue_number":118,"author_insertion...A
3A{"author_residue_number":10,"author_insertion_...13ii6strands1.0{"author_residue_number":3,"author_insertion_c...A
4A{"author_residue_number":23,"author_insertion_...13ii6strands1.0{"author_residue_number":13,"author_insertion_...A
5A{"author_residue_number":37,"author_insertion_...13ii6strands1.0{"author_residue_number":31,"author_insertion_...A
6A{"author_residue_number":44,"author_insertion_...13ii6strands1.0{"author_residue_number":42,"author_insertion_...A
7A{"author_residue_number":48,"author_insertion_...13ii6strands1.0{"author_residue_number":46,"author_insertion_...A
8A{"author_residue_number":88,"author_insertion_...13ii6strands2.0{"author_residue_number":84,"author_insertion_...A
9A{"author_residue_number":100,"author_insertion...13ii6strands2.0{"author_residue_number":94,"author_insertion_...A
10A{"author_residue_number":112,"author_insertion...13ii6strands2.0{"author_residue_number":105,"author_insertion...A
11A{"author_residue_number":115,"author_insertion...13ii6strands1.0{"author_residue_number":114,"author_insertion...A
# Fetch the sequence-conservation records for the selected entity and convert
# them with Base.to_dataframe; mask_id is given as "<pdb_id>/<entity_id>".
seq_conser_df = pdb_ob.fetch_from_pdbe_api(
    'graph-api/pdb/sequence_conservation/',
    Base.to_dataframe,
    mask_id=f"{record['pdb_id']}/{record['entity_id']}",
).result()

seq_conser_df

conservation_scoreentity_idlengthletter_arraypdb_idproba_arrayresidue_number
001203["M","L","I","V","A","F","T","S","K","R","E","...3ii6[0.217,0.168,0.096,0.096,0.054,0.042,0.04,0.03...1
101203["E","D","K","S","N","A","Q","R","T","G","L","...3ii6[0.239,0.101,0.082,0.071,0.065,0.062,0.062,0.0...2
201203["R","K","E","T","S","A","Q","N","D","G","L","...3ii6[0.158,0.151,0.08,0.071,0.07,0.064,0.064,0.049...3
301203["K","S","R","A","E","T","Q","N","D","V","L","...3ii6[0.121,0.097,0.089,0.088,0.087,0.085,0.064,0.0...4
421203["V","I","L","A","M","T","F","S","C","Y","E","...3ii6[0.444,0.227,0.128,0.033,0.029,0.026,0.021,0.0...5
........................
19701203["L","V","I","A","T","K","S","Q","E","F","M","...3ii6[0.286,0.099,0.079,0.07,0.051,0.046,0.046,0.03...198
19801203["L","A","K","E","S","V","T","R","I","Q","N","...3ii6[0.156,0.08,0.075,0.071,0.068,0.067,0.066,0.05...199
19901203["N","S","E","K","A","D","Q","R","T","V","L","...3ii6[0.109,0.106,0.103,0.094,0.077,0.068,0.067,0.0...200
20001203["E","A","K","S","D","T","N","Q","V","R","L","...3ii6[0.142,0.093,0.081,0.074,0.072,0.065,0.057,0.0...201
20101203["A","V","S","L","I","T","K","E","G","D","R","...3ii6[0.136,0.094,0.083,0.082,0.071,0.066,0.055,0.0...202

202 rows × 7 columns

Visualization

import matplotlib.pyplot as plt
import seaborn as sns
import orjson as json
plt.style.use('ggplot')
# Turn each row's two parallel JSON arrays (letter_array and proba_array) into
# one {letter: probability} dict, then build a frame indexed by residue_number
# with one column per letter.
# The two columns are zipped directly instead of using a row-wise
# DataFrame.apply(axis=1): this avoids constructing a Series per row and keeps
# working when seq_conser_df has no rows.
expanded_seq_conser_df = DataFrame(
    [
        dict(zip(json.loads(letters), json.loads(probas)))
        for letters, probas in zip(seq_conser_df['letter_array'], seq_conser_df['proba_array'])
    ],
    index=seq_conser_df.residue_number
)
expanded_seq_conser_df

MLIVAFTSKREQGYNDPHCW
residue_number
10.2170.1680.0960.0960.0540.0420.0400.0390.0360.0290.0280.0240.0230.0230.0210.0190.0140.0130.0110.007
20.0130.0320.0210.0300.0620.0120.0470.0710.0820.0510.2390.0620.0410.0150.0650.1010.0230.0250.0060.004
30.0170.0380.0240.0340.0640.0130.0710.0700.1510.1580.0800.0640.0450.0160.0490.0450.0210.0280.0070.004
40.0190.0420.0310.0440.0880.0150.0850.0970.1210.0890.0870.0640.0320.0180.0530.0500.0210.0320.0070.005
50.0290.1280.2270.4440.0330.0210.0260.0130.0080.0070.0080.0070.0070.0090.0060.0050.0060.0040.0100.003
...............................................................
1980.0340.2860.0790.0990.0700.0340.0510.0460.0460.0300.0380.0390.0220.0260.0250.0190.0180.0170.0140.007
1990.0280.1560.0510.0670.0800.0270.0660.0680.0750.0590.0710.0510.0290.0230.0440.0360.0230.0230.0150.007
2000.0180.0420.0270.0430.0770.0160.0560.1060.0940.0580.1030.0670.0360.0170.1090.0680.0230.0290.0080.005
2010.0200.0480.0370.0560.0930.0180.0650.0740.0810.0550.1420.0560.0450.0180.0570.0720.0240.0250.0100.005
2020.0250.0820.0710.0940.1360.0290.0660.0830.0550.0430.0530.0360.0520.0220.0410.0430.0260.0190.0170.006

202 rows × 20 columns

plt.figure(figsize=(10,8))
sns.heatmap(expanded_seq_conser_df, cmap='viridis')
HeatMap
sns.clustermap(expanded_seq_conser_df, cmap='viridis', method='ward')
ClusterMap