///////////////////////////////////////////////////////////////////////// // Gauss Integrals - A tool for describing protein backbone geometry // // // // Authors: Peter R{\o}gen & Boris Fain // ///////////////////////////////////////////////////////////////////////// /* Copyright (c) The researchers and professors of the Technical University of Denmark. All Rights Reserved This program may as of May 11'th 2011 be dawnloades and used under the GNU general Public License version 3 CITATIONS For publication of results, please cite: P. R{\o}gen & B. Fain, Automatic classification of protein structure by using Gauss integrals, PNAS, 100(1), 119-124, 2003. */ #include #include #include #include #define MAXLENGTH 1300 #define dipeptide 0 /*if 1 dipeptide ferquences of chains are printed*/ #define peptidesequence 0 /*if 1 peptide sequence of chains are printed*/ #define CylinderTransform 1 /*The Cylindertransform is normalized such that chain length and each structural measure has derivation one on a set of representatives of the homology (H) classes of CATH 2.4. Under this transform the Scaled Gauss Metric, SGM, introduced in P. R{\o}gen & B. Fain, Automatic classification of protein structure by using Gauss integrals, PNAS, 100(1), 119-124, 2003. is given by the usual scalar product in R^30 of the last 30 columns of each output line. If Cylindertransform is 0, GI writes the raw Gauss integrals as output. */ FILE *farrayo, *ferroro; double twoPI=2.0*M_PI; double polyl[MAXLENGTH][3]; /*C-alpha coordinates*/ double unitvectors[MAXLENGTH][MAXLENGTH][3]; double omega[MAXLENGTH][MAXLENGTH]; double absomega[MAXLENGTH][MAXLENGTH]; double partsum[MAXLENGTH][MAXLENGTH]; double abspartsum[MAXLENGTH][MAXLENGTH]; double len; int nres, n_proteins=0; char GlobalOneLetterId[MAXLENGTH]; int SequinceInteger[MAXLENGTH]; double length(double vec[3]); double dot(double vec1[3],double vec2[3]); void geovec(double vec2_minus_vec1[3], double vec1[3], double vec2[3]); void unit(double uvec[3],double vec[3]); void cross(double crossvec[3],double vec1[3],double vec2[3]); double spangle(double vec1[3], double vec2[3], double vec3[3]); void createunitvectors(void); void createomega(void); void createpartsums(void); double mixedsums(int a, int b, int c, int d); double absmixedsums(int a, int b, int c, int d); double int12(void); double inta12(void); double int12_34(void); double inta12_34(void); double int12_a34(void); double inta12_a34(void); double int13_24(void); double inta13_24(void); double int13_a24(void); double inta13_a24(void); double int14_23(void); double inta14_23(void); double int14_a23(void); double inta14_a23(void); double int12_34_56(void); double int12_35_46(void); double int12_36_45(void); double int13_24_56(void); double int13_25_46(void); double int13_26_45(void); double int14_23_56(void); double int14_25_36(void); double int14_26_35(void); double int15_23_46(void); double int15_24_36(void); double int15_26_34(void); double int15_26_34(void); double int16_23_45(void); double int16_24_35(void); double int16_25_34(void); void GaussIntegrals(void); void print_dipeptide_frequeces(void); void print_peptide_sequence(void); int Three2IntegerProteinId(char ThreeLeterAmno[3]); void read_pdb_file(char *dir, char *fname); void dirwalk(char *dir) { char name[1024]; struct dirent *dp; DIR *dfd; if ((dfd=opendir(dir))==NULL) { printf("Sorry - could not open directory <%s>\n",dir); return; } while ((dp=readdir(dfd))!=NULL) { if (strcmp(dp->d_name,".")==0 || strcmp(dp->d_name,"..")==0) continue; sprintf(name,"%s/%s",dir,dp->d_name); if (dp->d_name[strlen(dp->d_name)-4]=='.' && dp->d_name[strlen(dp->d_name)-3]=='p' && dp->d_name[strlen(dp->d_name)-2]=='d' && dp->d_name[strlen(dp->d_name)-1]=='b') { printf("After %i chains, it's %s with ",n_proteins,dp->d_name); read_pdb_file(dir,dp->d_name); } } closedir(dfd); } char line[1000]; int getatom(FILE *fi) { char *cp; while (cp=fgets(line,999,fi),strstr(line,"ATOM")!=line && cp!=NULL); if (cp==NULL) return 0; return 1; } void read_pdb_file(char *dir, char *fname) { FILE *fi; double x, y, z; int nr, no=-1500, link_count=0, start_chain,i; char chain_name, new_chain_name; char ThreeLeterAmno[3]; char dir_fname[1024], *cp; strcpy(dir_fname,dir); strcat(dir_fname,fname); fi=fopen(dir_fname,"r"); chain_name='a'; loop: if (getatom(fi)==0) { if(link_count9) { if(dipeptide==0){fprintf(farrayo,"%s %c %i %i",fname,chain_name,link_count,no-start_chain+1-link_count);} if(dipeptide==1){fprintf(farrayo,"%s ",fname);} nres=link_count; printf("%i residues\n",link_count); GaussIntegrals(); fclose(fi); return; } fclose(fi); return; } sscanf(line+22,"%4i",&nr); sscanf(line+21,"%1c",&new_chain_name); if(new_chain_name==' ') new_chain_name='0'; if (nr!=no && line[13]=='C' && line[14]=='A') { if(new_chain_name!=chain_name && chain_name!='a') { if(link_count9) { if(dipeptide==0){fprintf(farrayo,"%s %c %i %i",fname,chain_name,link_count,no-start_chain+1-link_count);} if(dipeptide==1){fprintf(farrayo,"%s ",fname);} nres=link_count; printf("%i residues ",link_count); GaussIntegrals(); } link_count=0; } link_count++; if (link_count>MAXLENGTH-1) { printf(" >MAXLENGTH=%i residues\n",MAXLENGTH); fprintf(ferroro, "WARNING! Chain %c of <%s> is longer than MAXLENGTH=%i. \n No further attempt to calculate Gauss Integrals for chains of\n <%s> has been made. Consider redefine MAXLENGTH in .\n", chain_name, fname, MAXLENGTH, fname); fclose(fi); return; } no=nr; if(link_count==1) start_chain=no; sscanf(line+30,"%lf %lf %lf",&x,&y,&z); ThreeLeterAmno[0]=line[17]; ThreeLeterAmno[1]=line[18]; ThreeLeterAmno[2]=line[19]; SequinceInteger[link_count]=Three2IntegerProteinId(ThreeLeterAmno); polyl[link_count][0]=x; polyl[link_count][1]=y; polyl[link_count][2]=z; chain_name=new_chain_name; } goto loop; } int Three2IntegerProteinId(char ThreeLeterAmno[3]) { if( strncmp(ThreeLeterAmno,"ALA",3)==0) return 5; if( strncmp(ThreeLeterAmno,"GLU",3)==0) return 4; if( strncmp(ThreeLeterAmno,"GLN",3)==0) return 7; if( strncmp(ThreeLeterAmno,"ASP",3)==0) return 3; if( strncmp(ThreeLeterAmno,"ASN",3)==0) return 6; if( strncmp(ThreeLeterAmno,"LEU",3)==0) return 17; if( strncmp(ThreeLeterAmno,"GLY",3)==0) return 1; if( strncmp(ThreeLeterAmno,"LYS",3)==0) return 10; if( strncmp(ThreeLeterAmno,"SER",3)==0) return 8; if( strncmp(ThreeLeterAmno,"VAL",3)==0) return 13; if( strncmp(ThreeLeterAmno,"ARG",3)==0) return 11; if( strncmp(ThreeLeterAmno,"THR",3)==0) return 9; if( strncmp(ThreeLeterAmno,"PRO",3)==0) return 2; if( strncmp(ThreeLeterAmno,"ILE",3)==0) return 14; if( strncmp(ThreeLeterAmno,"MET",3)==0) return 15; if( strncmp(ThreeLeterAmno,"PHE",3)==0) return 18; if( strncmp(ThreeLeterAmno,"TYR",3)==0) return 19; if( strncmp(ThreeLeterAmno,"CYS",3)==0) return 16; if( strncmp(ThreeLeterAmno,"TRP",3)==0) return 20; if( strncmp(ThreeLeterAmno,"HIS",3)==0) return 12; if( strncmp(ThreeLeterAmno,"ASX",3)==0) return 3; if( strncmp(ThreeLeterAmno,"Glx",3)==0) return 1; if( strncmp(ThreeLeterAmno,"PCA",3)==0) return 8; fprintf(ferroro, "WARNING! Do not know the amino acid residue %c%c%c \n",ThreeLeterAmno[0],ThreeLeterAmno[1],ThreeLeterAmno[2]); return 21; } /* BASIC VECTOR CALCULOS */ /* length: euklidean lenght in R^3*/ double length(double vec[3]) { return sqrt(dot(vec,vec)); } /* dot: dot product in R^3 */ double dot(double vec1[3],double vec2[3]) { return vec1[0]*vec2[0]+ vec1[1]*vec2[1]+ vec1[2]*vec2[2]; } /* unit: gives the unit vector v/|v|, uvec, in R^3 */ void unit(double uvec[3], double vec[3]) { int i; double len; len=length(vec); for (i = 0; i<3;i++) uvec[i]=vec[i]/len; } /* geovec: Gives the vector from vec1 to vec2 */ void geovec(double vec2_minus_vec1[3], double vec1[3], double vec2[3]) { int i; for (i=0; i<3;i++) vec2_minus_vec1[i]=vec2[i]-vec1[i]; } /* cross: gives the cross product, crossvec, of vec1 and vec2 in R^3 */ void cross(double crossvec[3],double vec1[3],double vec2[3]) { crossvec[0]=vec1[1]*vec2[2]-vec2[1]*vec1[2]; crossvec[1]=vec1[2]*vec2[0]-vec2[2]*vec1[0]; crossvec[2]=vec1[0]*vec2[1]-vec2[0]*vec1[1]; } /* cerateunitvectors creates the unit vectors from polyl[i] to polyl[j]. */ void createunitvectors(void) { int i,j; extern double polyl[MAXLENGTH][3]; extern double unitvectors[MAXLENGTH][MAXLENGTH][3]; extern int nres; double varvec[3]; for (i=1;i=0 ) omega[i][j]=twoPI-anglesum; else omega[i][j]=-twoPI-anglesum; absomega[i][j]=fabs(omega[i][j]); } } } } /* createpartsums creats partsum[i][j]=writhe of the segment * from polyl[i] to polyl[j+1] */ void createpartsums(void) { extern double omega[MAXLENGTH][MAXLENGTH]; extern double absomega[MAXLENGTH][MAXLENGTH]; extern int nres; extern double partsum[MAXLENGTH][MAXLENGTH]; extern double abspartsum[MAXLENGTH][MAXLENGTH]; int a, k; for (k=-1;k<2;k++) for (a=1;a \n",argv[3]); return 1; } farrayo=fopen(argv[2],"w"); if (farrayo==(FILE *)NULL) { printf("Sorry - could not open file <%s>\n",argv[2]); return 1; } dirwalk(argv[1]); printf("There has been calculated 29 Gauss-integrals for %i protein chains.\n", n_proteins); fclose(farrayo); fclose(ferroro); return 0; }