Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ElmerCSC
GitHub Repository: ElmerCSC/elmerfem
Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/pdb.c
3206 views
1
/************************************************************************/
2
/*! \file pdb.c
3
4
\brief Functions for parsing pdb files.
5
6
Pdb reader (parser). Loads arrays of pointers for easy backbone access.
7
8
\date Started 10/20/06
9
\author Kevin
10
\version $Id: pdb.c 10711 2011-08-31 22:23:04Z karypis $
11
*/
12
/************************************************************************/
13
#include <GKlib.h>
14
15
/************************************************************************/
16
/*! \brief Converts three-letter amino acid codes to one-leter codes.
17
18
This function takes a three letter \c * and converts it to a single \c
19
20
\param res is the three-letter code to be converted.
21
\returns A \c representing the amino acid.
22
*/
23
/************************************************************************/
24
char gk_threetoone(char *res) { /* {{{ */
25
/* make sure the matching works */
26
res[0] = toupper(res[0]);
27
res[1] = toupper(res[1]);
28
res[2] = toupper(res[2]);
29
if(strcmp(res,"ALA") == 0) {
30
return 'A';
31
}
32
else if(strcmp(res,"CYS") == 0) {
33
return 'C';
34
}
35
else if(strcmp(res,"ASP") == 0) {
36
return 'D';
37
}
38
else if(strcmp(res,"GLU") == 0) {
39
return 'E';
40
}
41
else if(strcmp(res,"PHE") == 0) {
42
return 'F';
43
}
44
else if(strcmp(res,"GLY") == 0) {
45
return 'G';
46
}
47
else if(strcmp(res,"HIS") == 0) {
48
return 'H';
49
}
50
else if(strcmp(res,"ILE") == 0) {
51
return 'I';
52
}
53
else if(strcmp(res,"LYS") == 0) {
54
return 'K';
55
}
56
else if(strcmp(res,"LEU") == 0) {
57
return 'L';
58
}
59
else if(strcmp(res,"MET") == 0) {
60
return 'M';
61
}
62
else if(strcmp(res,"ASN") == 0) {
63
return 'N';
64
}
65
else if(strcmp(res,"PRO") == 0) {
66
return 'P';
67
}
68
else if(strcmp(res,"GLN") == 0) {
69
return 'Q';
70
}
71
else if(strcmp(res,"ARG") == 0) {
72
return 'R';
73
}
74
else if(strcmp(res,"SER") == 0) {
75
return 'S';
76
}
77
else if(strcmp(res,"THR") == 0) {
78
return 'T';
79
}
80
else if(strcmp(res,"SCY") == 0) {
81
return 'U';
82
}
83
else if(strcmp(res,"VAL") == 0) {
84
return 'V';
85
}
86
else if(strcmp(res,"TRP") == 0) {
87
return 'W';
88
}
89
else if(strcmp(res,"TYR") == 0) {
90
return 'Y';
91
}
92
else {
93
return 'X';
94
}
95
} /* }}} */
96
97
/************************************************************************/
98
/*! \brief Frees the memory of a pdbf structure.
99
100
This function takes a pdbf pointer and frees all the memory below it.
101
102
\param p is the pdbf structure to be freed.
103
*/
104
/************************************************************************/
105
void gk_freepdbf(pdbf *p) { /* {{{ */
106
int i;
107
if(p != NULL) {
108
gk_free((void **)&p->resSeq, LTERM);
109
for(i=0; i<p->natoms; i++) {
110
gk_free((void **)&p->atoms[i].name, &p->atoms[i].resname, LTERM);
111
}
112
for(i=0; i<p->nresidues; i++) {
113
gk_free((void *)&p->threeresSeq[i], LTERM);
114
}
115
/* this may look like it's wrong, but it's just a 1-d array of pointers, and
116
the pointers themselves are freed above */
117
gk_free((void **)&p->bbs, &p->cas, &p->atoms, &p->cm, &p->threeresSeq, LTERM);
118
}
119
gk_free((void **)&p, LTERM);
120
} /* }}} */
121
122
/************************************************************************/
123
/*! \brief Reads a pdb file into a pdbf structure
124
125
This function allocates a pdbf structure and reads the file fname into
126
that structure.
127
128
\param fname is the file name to be read
129
\returns A filled pdbf structure.
130
*/
131
/************************************************************************/
132
pdbf *gk_readpdbfile(char *fname) { /* {{{ */
133
int i=0, res=0;
134
char linetype[6];
135
int aserial;
136
char aname[5] = " \0";
137
char altLoc = ' ';
138
char rname[4] = " \0";
139
char chainid = ' ';
140
char oldchainid = ' ';
141
int rserial;
142
int oldRserial = -37;
143
char icode = ' ';
144
char element = ' ';
145
double x;
146
double y;
147
double z;
148
double avgx;
149
double avgy;
150
double avgz;
151
double opcy;
152
double tmpt;
153
char line[MAXLINELEN];
154
int corruption=0;
155
int nresatoms;
156
157
int atoms=0, residues=0, cas=0, bbs=0, firstres=1;
158
pdbf *toFill = gk_malloc(sizeof(pdbf),"fillme");
159
FILE *FPIN;
160
161
FPIN = gk_fopen(fname,"r",fname);
162
while(fgets(line, 256, FPIN)) {
163
sscanf(line,"%s ",linetype);
164
/* It seems the only reliable parts are through temperature, so we only use these parts */
165
/* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */
166
if(strstr(linetype, "ATOM") != NULL) {
167
sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",
168
linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);
169
sscanf(linetype, " %s ",linetype);
170
sscanf(aname, " %s ",aname);
171
sscanf(rname, " %s ",rname);
172
if(altLoc != ' ') {
173
corruption = corruption|CRP_ALTLOCS;
174
}
175
176
if(firstres == 1) {
177
oldRserial = rserial;
178
oldchainid = chainid;
179
residues++;
180
firstres = 0;
181
}
182
if(oldRserial != rserial) {
183
residues++;
184
oldRserial = rserial;
185
}
186
if(oldchainid != chainid) {
187
corruption = corruption|CRP_MULTICHAIN;
188
}
189
oldchainid = chainid;
190
atoms++;
191
if(strcmp(aname,"CA") == 0) {
192
cas++;
193
}
194
if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 ||
195
strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {
196
bbs++;
197
}
198
}
199
else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {
200
break;
201
}
202
}
203
fclose(FPIN);
204
205
/* printf("File has coordinates for %d atoms in %d residues\n",atoms,residues); */
206
toFill->natoms = atoms;
207
toFill->ncas = cas;
208
toFill->nbbs = bbs;
209
toFill->nresidues = residues;
210
toFill->resSeq = (char *) gk_malloc (residues*sizeof(char),"residue seq");
211
toFill->threeresSeq = (char **)gk_malloc (residues*sizeof(char *),"residue seq");
212
toFill->atoms = (atom *) gk_malloc (atoms*sizeof(atom), "atoms");
213
toFill->bbs = (atom **)gk_malloc ( bbs*sizeof(atom *),"bbs");
214
toFill->cas = (atom **)gk_malloc ( cas*sizeof(atom *),"cas");
215
toFill->cm = (center_of_mass *)gk_malloc(residues*sizeof(center_of_mass),"center of mass");
216
res=0; firstres=1; cas=0; bbs=0; i=0;
217
avgx = 0.0; avgy = 0.0; avgz = 0.0;
218
nresatoms = 0;
219
220
FPIN = gk_fopen(fname,"r",fname);
221
while(fgets(line, 256, FPIN)) {
222
sscanf(line,"%s ",linetype);
223
/* It seems the only reliable parts are through temperature, so we only use these parts */
224
/* if(strstr(linetype, "ATOM") != NULL || strstr(linetype, "HETATM") != NULL) { */
225
if(strstr(linetype, "ATOM") != NULL ) {
226
227
/* to ensure our memory doesn't get corrupted by the biologists, we only read this far */
228
sscanf(line, "%6s%5d%*1c%4c%1c%3c%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf %c\n",
229
linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,&element);
230
sscanf(aname, "%s",aname);
231
sscanf(rname, "%s",rname);
232
233
if(firstres == 1) {
234
toFill->resSeq[res] = gk_threetoone(rname);
235
toFill->threeresSeq[res] = gk_strdup(rname);
236
oldRserial = rserial;
237
res++;
238
firstres = 0;
239
}
240
if(oldRserial != rserial) {
241
/* we're changing residues. store the center of mass from the last one & reset */
242
toFill->cm[res-1].x = avgx/nresatoms;
243
toFill->cm[res-1].y = avgy/nresatoms;
244
toFill->cm[res-1].z = avgz/nresatoms;
245
avgx = 0.0; avgy = 0.0; avgz = 0.0;
246
nresatoms = 0;
247
toFill->cm[res-1].name = toFill->resSeq[res-1];
248
249
toFill->threeresSeq[res] = gk_strdup(rname);
250
toFill->resSeq[res] = gk_threetoone(rname);
251
res++;
252
oldRserial = rserial;
253
}
254
avgx += x;
255
avgy += y;
256
avgz += z;
257
nresatoms++;
258
259
toFill->atoms[i].x = x;
260
toFill->atoms[i].y = y;
261
toFill->atoms[i].z = z;
262
toFill->atoms[i].opcy = opcy;
263
toFill->atoms[i].tmpt = tmpt;
264
toFill->atoms[i].element = element;
265
toFill->atoms[i].serial = aserial;
266
toFill->atoms[i].chainid = chainid;
267
toFill->atoms[i].altLoc = altLoc;
268
toFill->atoms[i].rserial = rserial;
269
toFill->atoms[i].icode = icode;
270
toFill->atoms[i].name = gk_strdup(aname);
271
toFill->atoms[i].resname = gk_strdup(rname);
272
/* Set up pointers for the backbone and c-alpha shortcuts */
273
if(strcmp(aname,"CA") == 0) {
274
toFill->cas[cas] = &(toFill->atoms[i]);
275
cas++;
276
}
277
if(strcmp(aname,"N") == 0 || strcmp(aname,"CA") == 0 || strcmp(aname,"C") == 0 || strcmp(aname,"O") == 0) {
278
toFill->bbs[bbs] = &(toFill->atoms[i]);
279
bbs++;
280
}
281
i++;
282
}
283
else if(strstr(linetype, "ENDMDL") != NULL || strstr(linetype, "END") != NULL || strstr(linetype, "TER") != NULL) {
284
break;
285
}
286
}
287
/* get that last average */
288
toFill->cm[res-1].x = avgx/nresatoms;
289
toFill->cm[res-1].y = avgy/nresatoms;
290
toFill->cm[res-1].z = avgz/nresatoms;
291
/* Begin test code */
292
if(cas != residues) {
293
printf("Number of residues and CA coordinates differs by %d (!)\n",residues-cas);
294
if(cas < residues) {
295
corruption = corruption|CRP_MISSINGCA;
296
}
297
else if(cas > residues) {
298
corruption = corruption|CRP_MULTICA;
299
}
300
}
301
if(bbs < residues*4) {
302
corruption = corruption|CRP_MISSINGBB;
303
}
304
else if(bbs > residues*4) {
305
corruption = corruption|CRP_MULTIBB;
306
}
307
fclose(FPIN);
308
toFill->corruption = corruption;
309
/* if(corruption == 0)
310
printf("File was clean!\n"); */
311
return(toFill);
312
} /* }}} */
313
314
/************************************************************************/
315
/*! \brief Writes the sequence of residues from a pdb file.
316
317
This function takes a pdbf structure and a filename, and writes out
318
the amino acid sequence according to the atomic coordinates. The output
319
is in fasta format.
320
321
322
\param p is the pdbf structure with the sequence of interest
323
\param fname is the file name to be written
324
*/
325
/************************************************************************/
326
void gk_writefastafrompdb(pdbf *pb, char *fname) {
327
int i;
328
FILE *FPOUT;
329
330
FPOUT = gk_fopen(fname,"w",fname);
331
fprintf(FPOUT,"> %s\n",fname);
332
333
for(i=0; i<pb->nresidues; i++)
334
fprintf(FPOUT,"%c",pb->resSeq[i]);
335
336
fprintf(FPOUT,"\n");
337
fclose(FPOUT);
338
}
339
340
/************************************************************************/
341
/*! \brief Writes all centers of mass in pdb-format to file fname.
342
343
This function takes a pdbf structure and writes out the calculated
344
mass center information to file fname as though each one was a c-alpha.
345
346
\param p is the pdbf structure to write out
347
\param fname is the file name to be written
348
*/
349
/************************************************************************/
350
void gk_writecentersofmass(pdbf *p, char *fname) {
351
int i;
352
FILE *FPIN;
353
FPIN = gk_fopen(fname,"w",fname);
354
for(i=0; i<p->nresidues; i++) {
355
fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
356
"ATOM ",i,"CA",' ',p->threeresSeq[i],' ',i,' ',p->cm[i].x,p->cm[i].y,p->cm[i].z,1.0,-37.0);
357
}
358
fclose(FPIN);
359
}
360
361
/************************************************************************/
362
/*! \brief Writes all atoms in p in pdb-format to file fname.
363
364
This function takes a pdbf structure and writes out all the atom
365
information to file fname.
366
367
\param p is the pdbf structure to write out
368
\param fname is the file name to be written
369
*/
370
/************************************************************************/
371
void gk_writefullatom(pdbf *p, char *fname) {
372
int i;
373
FILE *FPIN;
374
FPIN = gk_fopen(fname,"w",fname);
375
for(i=0; i<p->natoms; i++) {
376
fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
377
"ATOM ",p->atoms[i].serial,p->atoms[i].name,p->atoms[i].altLoc,p->atoms[i].resname,p->atoms[i].chainid,p->atoms[i].rserial,p->atoms[i].icode,p->atoms[i].x,p->atoms[i].y,p->atoms[i].z,p->atoms[i].opcy,p->atoms[i].tmpt);
378
}
379
fclose(FPIN);
380
}
381
382
/************************************************************************/
383
/*! \brief Writes out all the backbone atoms of a structure in pdb format
384
385
This function takes a pdbf structure p and writes only the backbone atoms
386
to a filename fname.
387
388
\param p is the pdb structure to write out.
389
\param fname is the file name to be written.
390
*/
391
/************************************************************************/
392
void gk_writebackbone(pdbf *p, char *fname) {
393
int i;
394
FILE *FPIN;
395
FPIN = gk_fopen(fname,"w",fname);
396
for(i=0; i<p->nbbs; i++) {
397
fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
398
"ATOM ",p->bbs[i]->serial,p->bbs[i]->name,p->bbs[i]->altLoc,p->bbs[i]->resname,p->bbs[i]->chainid,p->bbs[i]->rserial,p->bbs[i]->icode,p->bbs[i]->x,p->bbs[i]->y,p->bbs[i]->z,p->bbs[i]->opcy,p->bbs[i]->tmpt);
399
}
400
fclose(FPIN);
401
}
402
403
/************************************************************************/
404
/*! \brief Writes out all the alpha carbon atoms of a structure
405
406
This function takes a pdbf structure p and writes only the alpha carbon
407
atoms to a filename fname.
408
409
\param p is the pdb structure to write out.
410
\param fname is the file name to be written.
411
*/
412
/************************************************************************/
413
void gk_writealphacarbons(pdbf *p, char *fname) {
414
int i;
415
FILE *FPIN;
416
FPIN = gk_fopen(fname,"w",fname);
417
for(i=0; i<p->ncas; i++) {
418
fprintf(FPIN,"%-6s%5d %4s%1c%3s %1c%4d%1c %8.3lf%8.3lf%8.3lf%6.2f%6.2f\n",
419
"ATOM ",p->cas[i]->serial,p->cas[i]->name,p->cas[i]->altLoc,p->cas[i]->resname,p->cas[i]->chainid,p->cas[i]->rserial,p->cas[i]->icode,p->cas[i]->x,p->cas[i]->y,p->cas[i]->z,p->cas[i]->opcy,p->cas[i]->tmpt);
420
}
421
fclose(FPIN);
422
}
423
424
/************************************************************************/
425
/*! \brief Decodes the corruption bitswitch and prints any problems
426
427
Due to the totally unreliable nature of the pdb format, reading a pdb
428
file stores a corruption bitswitch, and this function decodes that switch
429
and prints the result on stdout.
430
431
\param p is the pdb structure to write out.
432
\param fname is the file name to be written.
433
*/
434
/************************************************************************/
435
void gk_showcorruption(pdbf *p) {
436
int corruption = p->corruption;
437
if(corruption&CRP_ALTLOCS)
438
printf("Multiple coordinate sets for at least one atom\n");
439
if(corruption&CRP_MISSINGCA)
440
printf("Missing coordiantes for at least one CA atom\n");
441
if(corruption&CRP_MISSINGBB)
442
printf("Missing coordiantes for at least one backbone atom (N,CA,C,O)\n");
443
if(corruption&CRP_MULTICHAIN)
444
printf("File contains coordinates for multiple chains\n");
445
if(corruption&CRP_MULTICA)
446
printf("Multiple CA atoms found for the same residue (could be alternate locators)\n");
447
if(corruption&CRP_MULTICA)
448
printf("Multiple copies of backbone atoms found for the same residue (could be alternate locators)\n");
449
}
450
/* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%4s%2s%2s\n",
451
linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,segId,element,charge);
452
printf(".%s.%s.%s.\n",segId,element,charge);
453
printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%4s%2s%2s\n",
454
linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",segId,element,charge); */
455
456
/* and we could probably get away with this using astral files, */
457
/* sscanf(line, "%6s%5d%*1c%4s%1c%3s%*1c%1c%4d%1c%*3c%8lf%8lf%8lf%6lf%6lf%*6c%6s\n",
458
linetype,&aserial,aname,&altLoc,rname,&chainid,&rserial,&icode,&x,&y,&z,&opcy,&tmpt,element);
459
printf("%-6s%5d%-1s%-4s%1c%3s%1s%1c%4d%1c%3s%8.3lf%8.3lf%8.3lf%6.2f%6.2f%6s%6s\n",
460
linetype,aserial," ",aname,altLoc,rname," ",chainid,rserial,icode," ",x,y,z,opcy,tmpt," ",element); */
461
462