Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/pdb.c
3206 views
/************************************************************************/1/*! \file pdb.c23\brief Functions for parsing pdb files.45Pdb reader (parser). Loads arrays of pointers for easy backbone access.67\date Started 10/20/068\author Kevin9\version $Id: pdb.c 10711 2011-08-31 22:23:04Z karypis $10*/11/************************************************************************/12#include <GKlib.h>1314/************************************************************************/15/*! \brief Converts three-letter amino acid codes to one-leter codes.1617This function takes a three letter \c * and converts it to a single \c1819\param res is the three-letter code to be converted.20\returns A \c representing the amino acid.21*/22/************************************************************************/23char gk_threetoone(char *res) { /* {{{ */24/* make sure the matching works */25res[0] = toupper(res[0]);26res[1] = toupper(res[1]);27res[2] = toupper(res[2]);28if(strcmp(res,"ALA") == 0) {29return 'A';30}31else if(strcmp(res,"CYS") == 0) {32return 'C';33}34else if(strcmp(res,"ASP") == 0) {35return 'D';36}37else if(strcmp(res,"GLU") == 0) {38return 'E';39}40else if(strcmp(res,"PHE") == 0) {41return 'F';42}43else if(strcmp(res,"GLY") == 0) {44return 'G';45}46else if(strcmp(res,"HIS") == 0) {47return 'H';48}49else if(strcmp(res,"ILE") == 0) {50return 'I';51}52else if(strcmp(res,"LYS") == 0) {53return 'K';54}55else if(strcmp(res,"LEU") == 0) {56return 'L';57}58else if(strcmp(res,"MET") == 0) {59return 'M';60}61else if(strcmp(res,"ASN") == 0) {62return 'N';63}64else if(strcmp(res,"PRO") == 0) {65return 'P';66}67else if(strcmp(res,"GLN") == 0) {68return 'Q';69}70else if(strcmp(res,"ARG") == 0) {71return 'R';72}73else if(strcmp(res,"SER") == 0) {74return 'S';75}76else if(strcmp(res,"THR") == 0) {77return 'T';78}79else if(strcmp(res,"SCY") == 0) {80return 'U';81}82else if(strcmp(res,"VAL") == 0) {83return 'V';84}85else if(strcmp(res,"TRP") == 0) {86return 'W';87}88else if(strcmp(res,"TYR") == 0) {89return 'Y';90}91else {92return 'X';93}94} /* }}} */9596/************************************************************************/97/*! \brief Frees the memory of a pdbf structure.9899This function takes a pdbf pointer and frees all the memory below it.100101\param p is the pdbf structure to be freed.102*/103/************************************************************************/104void gk_freepdbf(pdbf *p) { /* {{{ */105int i;106if(p != NULL) {107gk_free((void **)&p->resSeq, LTERM);108for(i=0; i<p->natoms; i++) {109gk_free((void **)&p->atoms[i].name, &p->atoms[i].resname, LTERM);110}111for(i=0; i<p->nresidues; i++) {112gk_free((void *)&p->threeresSeq[i], LTERM);113}114/* this may look like it's wrong, but it's just a 1-d array of pointers, and115the pointers themselves are freed above */116gk_free((void **)&p->bbs, &p->cas, &p->atoms, &p->cm, &p->threeresSeq, LTERM);117}118gk_free((void **)&p, LTERM);119} /* }}} */120121/************************************************************************/122/*! \brief Reads a pdb file into a pdbf structure123124This function allocates a pdbf structure and reads the file fname into125that structure.126127\param fname is the file name to be read128\returns A filled pdbf structure.129*/130/************************************************************************/131pdbf *gk_readpdbfile(char *fname) { /* {{{ */132int i=0, res=0;133char linetype[6];134int aserial;135char aname[5] = " \0";136char altLoc = ' ';137char rname[4] = " \0";138char chainid = ' ';139char oldchainid = ' ';140int rserial;141int oldRserial = -37;142char icode = ' ';143char element = ' ';144double x;145double y;146double z;147double avgx;148double avgy;149double avgz;150double opcy;151double tmpt;152char line[MAXLINELEN];153int corruption=0;154int nresatoms;155156int atoms=0, residues=0, cas=0, bbs=0, firstres=1;157pdbf *toFill = gk_malloc(sizeof(pdbf),"fillme");158FILE *FPIN;159160FPIN = gk_fopen(fname,"r",fname);161while(fgets(line, 256, FPIN)) {162sscanf(line,"%s ",linetype);163/* It seems the only reliable parts are through temperature, so we only use these parts */164/* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */165if(strstr(linetype, "ATOM") != NULL) {166sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",167linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);168sscanf(linetype, " %s ",linetype);169sscanf(aname, " %s ",aname);170sscanf(rname, " %s ",rname);171if(altLoc != ' ') {172corruption = corruption|CRP_ALTLOCS;173}174175if(firstres == 1) {176oldRserial = rserial;177oldchainid = chainid;178residues++;179firstres = 0;180}181if(oldRserial != rserial) {182residues++;183oldRserial = rserial;184}185if(oldchainid != chainid) {186corruption = corruption|CRP_MULTICHAIN;187}188oldchainid = chainid;189atoms++;190if(strcmp(aname,"CA") == 0) {191cas++;192}193if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 ||194strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {195bbs++;196}197}198else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {199break;200}201}202fclose(FPIN);203204/* printf("File has coordinates for %d atoms in %d residues\n",atoms,residues); */205toFill->natoms = atoms;206toFill->ncas = cas;207toFill->nbbs = bbs;208toFill->nresidues = residues;209toFill->resSeq = (char *) gk_malloc (residues*sizeof(char),"residue seq");210toFill->threeresSeq = (char **)gk_malloc (residues*sizeof(char *),"residue seq");211toFill->atoms = (atom *) gk_malloc (atoms*sizeof(atom), "atoms");212toFill->bbs = (atom **)gk_malloc ( bbs*sizeof(atom *),"bbs");213toFill->cas = (atom **)gk_malloc ( cas*sizeof(atom *),"cas");214toFill->cm = (center_of_mass *)gk_malloc(residues*sizeof(center_of_mass),"center of mass");215res=0; firstres=1; cas=0; bbs=0; i=0;216avgx = 0.0; avgy = 0.0; avgz = 0.0;217nresatoms = 0;218219FPIN = gk_fopen(fname,"r",fname);220while(fgets(line, 256, FPIN)) {221sscanf(line,"%s ",linetype);222/* It seems the only reliable parts are through temperature, so we only use these parts */223/* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */224if(strstr(linetype, "ATOM") != NULL ) {225226/* to ensure our memory doesn't get corrupted by the biologists, we only read this far */227sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",228linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);229sscanf(aname, "%s",aname);230sscanf(rname, "%s",rname);231232if(firstres == 1) {233toFill->resSeq[res] = gk_threetoone(rname);234toFill->threeresSeq[res] = gk_strdup(rname);235oldRserial = rserial;236res++;237firstres = 0;238}239if(oldRserial != rserial) {240/* we're changing residues. store the center of mass from the last one & reset */241toFill->cm[res-1].x = avgx/nresatoms;242toFill->cm[res-1].y = avgy/nresatoms;243toFill->cm[res-1].z = avgz/nresatoms;244avgx = 0.0; avgy = 0.0; avgz = 0.0;245nresatoms = 0;246toFill->cm[res-1].name = toFill->resSeq[res-1];247248toFill->threeresSeq[res] = gk_strdup(rname);249toFill->resSeq[res] = gk_threetoone(rname);250res++;251oldRserial = rserial;252}253avgx += x;254avgy += y;255avgz += z;256nresatoms++;257258toFill->atoms[i].x = x;259toFill->atoms[i].y = y;260toFill->atoms[i].z = z;261toFill->atoms[i].opcy = opcy;262toFill->atoms[i].tmpt = tmpt;263toFill->atoms[i].element = element;264toFill->atoms[i].serial = aserial;265toFill->atoms[i].chainid = chainid;266toFill->atoms[i].altLoc = altLoc;267toFill->atoms[i].rserial = rserial;268toFill->atoms[i].icode = icode;269toFill->atoms[i].name = gk_strdup(aname);270toFill->atoms[i].resname = gk_strdup(rname);271/* Set up pointers for the backbone and c-alpha shortcuts */272if(strcmp(aname,"CA") == 0) {273toFill->cas[cas] = &(toFill->atoms[i]);274cas++;275}276if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 || strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {277toFill->bbs[bbs] = &(toFill->atoms[i]);278bbs++;279}280i++;281}282else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {283break;284}285}286/* get that last average */287toFill->cm[res-1].x = avgx/nresatoms;288toFill->cm[res-1].y = avgy/nresatoms;289toFill->cm[res-1].z = avgz/nresatoms;290/* Begin test code */291if(cas != residues) {292printf("Number of residues and CA coordinates differs by %d (!)\n",residues-cas);293if(cas < residues) {294corruption = corruption|CRP_MISSINGCA;295}296else if(cas > residues) {297corruption = corruption|CRP_MULTICA;298}299}300if(bbs < residues*4) {301corruption = corruption|CRP_MISSINGBB;302}303else if(bbs > residues*4) {304corruption = corruption|CRP_MULTIBB;305}306fclose(FPIN);307toFill->corruption = corruption;308/* if(corruption == 0)309printf("File was clean!\n"); */310return(toFill);311} /* }}} */312313/************************************************************************/314/*! \brief Writes the sequence of residues from a pdb file.315316This function takes a pdbf structure and a filename, and writes out317the amino acid sequence according to the atomic coordinates. The output318is in fasta format.319320321\param p is the pdbf structure with the sequence of interest322\param fname is the file name to be written323*/324/************************************************************************/325void gk_writefastafrompdb(pdbf *pb, char *fname) {326int i;327FILE *FPOUT;328329FPOUT = gk_fopen(fname,"w",fname);330fprintf(FPOUT,"> %s\n",fname);331332for(i=0; i<pb->nresidues; i++)333fprintf(FPOUT,"%c",pb->resSeq[i]);334335fprintf(FPOUT,"\n");336fclose(FPOUT);337}338339/************************************************************************/340/*! \brief Writes all centers of mass in pdb-format to file fname.341342This function takes a pdbf structure and writes out the calculated343mass center information to file fname as though each one was a c-alpha.344345\param p is the pdbf structure to write out346\param fname is the file name to be written347*/348/************************************************************************/349void gk_writecentersofmass(pdbf *p, char *fname) {350int i;351FILE *FPIN;352FPIN = gk_fopen(fname,"w",fname);353for(i=0; i<p->nresidues; i++) {354fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",355"ATOM ",i,"CA",' ',p->threeresSeq[i],' ',i,' ',p->cm[i].x,p->cm[i].y,p->cm[i].z,1.0,-37.0);356}357fclose(FPIN);358}359360/************************************************************************/361/*! \brief Writes all atoms in p in pdb-format to file fname.362363This function takes a pdbf structure and writes out all the atom364information to file fname.365366\param p is the pdbf structure to write out367\param fname is the file name to be written368*/369/************************************************************************/370void gk_writefullatom(pdbf *p, char *fname) {371int i;372FILE *FPIN;373FPIN = gk_fopen(fname,"w",fname);374for(i=0; i<p->natoms; i++) {375fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",376"ATOM ",p->atoms[i].serial,p->atoms[i].name,p->atoms[i].altLoc,p->atoms[i].resname,p->atoms[i].chainid,p->atoms[i].rserial,p->atoms[i].icode,p->atoms[i].x,p->atoms[i].y,p->atoms[i].z,p->atoms[i].opcy,p->atoms[i].tmpt);377}378fclose(FPIN);379}380381/************************************************************************/382/*! \brief Writes out all the backbone atoms of a structure in pdb format383384This function takes a pdbf structure p and writes only the backbone atoms385to a filename fname.386387\param p is the pdb structure to write out.388\param fname is the file name to be written.389*/390/************************************************************************/391void gk_writebackbone(pdbf *p, char *fname) {392int i;393FILE *FPIN;394FPIN = gk_fopen(fname,"w",fname);395for(i=0; i<p->nbbs; i++) {396fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",397"ATOM ",p->bbs[i]->serial,p->bbs[i]->name,p->bbs[i]->altLoc,p->bbs[i]->resname,p->bbs[i]->chainid,p->bbs[i]->rserial,p->bbs[i]->icode,p->bbs[i]->x,p->bbs[i]->y,p->bbs[i]->z,p->bbs[i]->opcy,p->bbs[i]->tmpt);398}399fclose(FPIN);400}401402/************************************************************************/403/*! \brief Writes out all the alpha carbon atoms of a structure404405This function takes a pdbf structure p and writes only the alpha carbon406atoms to a filename fname.407408\param p is the pdb structure to write out.409\param fname is the file name to be written.410*/411/************************************************************************/412void gk_writealphacarbons(pdbf *p, char *fname) {413int i;414FILE *FPIN;415FPIN = gk_fopen(fname,"w",fname);416for(i=0; i<p->ncas; i++) {417fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",418"ATOM ",p->cas[i]->serial,p->cas[i]->name,p->cas[i]->altLoc,p->cas[i]->resname,p->cas[i]->chainid,p->cas[i]->rserial,p->cas[i]->icode,p->cas[i]->x,p->cas[i]->y,p->cas[i]->z,p->cas[i]->opcy,p->cas[i]->tmpt);419}420fclose(FPIN);421}422423/************************************************************************/424/*! \brief Decodes the corruption bitswitch and prints any problems425426Due to the totally unreliable nature of the pdb format, reading a pdb427file stores a corruption bitswitch, and this function decodes that switch428and prints the result on stdout.429430\param p is the pdb structure to write out.431\param fname is the file name to be written.432*/433/************************************************************************/434void gk_showcorruption(pdbf *p) {435int corruption = p->corruption;436if(corruption&CRP_ALTLOCS)437printf("Multiple coordinate sets for at least one atom\n");438if(corruption&CRP_MISSINGCA)439printf("Missing coordiantes for at least one CA atom\n");440if(corruption&CRP_MISSINGBB)441printf("Missing coordiantes for at least one backbone atom (N,CA,C,O)\n");442if(corruption&CRP_MULTICHAIN)443printf("File contains coordinates for multiple chains\n");444if(corruption&CRP_MULTICA)445printf("Multiple CA atoms found for the same residue (could be alternate locators)\n");446if(corruption&CRP_MULTICA)447printf("Multiple copies of backbone atoms found for the same residue (could be alternate locators)\n");448}449/* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%4s%2s%2s\n",450linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,segId,element,charge);451printf(".%s.%s.%s.\n",segId,element,charge);452printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%4s%2s%2s\n",453linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",segId,element,charge); */454455/* and we could probably get away with this using astral files, */456/* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%6s\n",457linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,element);458printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%6s\n",459linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",element); */460461462