/*------------------------------------------------------------------*\ | Routines to read and apply transformations to data bases. | | | | Peter N. Schweitzer (U.S. Geological Survey, Reston, VA 22092) | \*------------------------------------------------------------------*/ #include #include #include #include #include #include "analog.h" #ifdef THINK_C extern FILE *Mac_fopen (char *name, char *mode); #define FOPEN Mac_fopen #else #define FOPEN fopen #endif enum word_code { Wnew, Wadd, Wsubtract, Wmultiply, Wdivide, Wignore, Wall, Wby, WLBrace, WRBrace, WIdentifier, WNULL }; struct word { char *name; enum word_code code; }; static int word_list_count = 0; static struct word word_list[] = { {"new", Wnew }, {"add", Wadd }, {"subtract", Wsubtract }, {"sub", Wsubtract }, {"multiply", Wmultiply }, {"mul", Wmultiply }, {"divide", Wdivide }, {"div", Wdivide }, {"ignore", Wignore }, {"all", Wall }, {"by", Wby }, {NULL, WNULL } }; static int compare_words (const void *e1, const void *e2) { struct word *w1, *w2; w1 = (struct word *) e1; w2 = (struct word *) e2; return (stricmp (w1->name,w2->name)); } /*----------------------------------------------------------------------*\ \*----------------------------------------------------------------------*/ enum statistic_code { Wvalue, Wcount, Wsum, Wssq, Wmean, Wvar, Wsdev }; struct statistic_list { char *name; enum statistic_code code; }; static struct statistic_list stat_list[] = { {"count", Wcount, }, {"sum", Wsum, }, {"ssq", Wssq, }, {"mean", Wmean, }, {"var", Wvar, }, {"sdev", Wsdev }, {NULL, Wvalue } }; /*----------------------------------------------------------------------*\ \*----------------------------------------------------------------------*/ struct token { enum word_code code; char string[MAX_NAME_LENGTH]; int line_number; }; static struct token *token = NULL; static int token_count = 0; static void find_tokens (char *job_file) { int n; FILE *in; char line [MAX_CONFIG_LINE_LEN]; int line_count = 0; char *s; struct word w,*match; char string[MAX_NAME_LENGTH]; char *b, *e; int token_limit = 0; if (in = FOPEN (job_file,"r")) { if (!word_list_count) { for (word_list_count=0; word_list[word_list_count].name; word_list_count++); qsort (word_list,word_list_count,sizeof(struct word),compare_words); } if (!(token = (struct token *) malloc (GRANULARITY * sizeof (struct token)))) { sprintf (message,"Error: could not allocate space for token array"); error_exit (message); } token_count = 0; token_limit = GRANULARITY; while (fgets (line,MAX_CONFIG_LINE_LEN,in)) { line_count++; if (s = strrchr (line,'\n')) *s = 0; if (s = strrchr (line,'\r')) *s = 0; s = line; while (*s && isspace (*s)) s++; /*------------------------------------------------------*\ | Read the tokens and put in the array token. | \*------------------------------------------------------*/ while (*s && *s != '#') { while (*s && isspace (*s)) s++; if (*s) switch (*s) { case '{': token[token_count].code = WLBrace; strcpy (token[token_count].string,"{"); token[token_count].line_number = line_count; token_count++; s++; break; case '}': token[token_count].code = WRBrace; strcpy (token[token_count].string,"}"); token[token_count].line_number = line_count; token_count++; s++; break; default: b = s; if (e = strpbrk (s," \t{#}")) { memcpy (string,b,e-b); string[e-b] = 0; } else strcpy (string,b); if (*string) { w.name = string; if (match = (struct word *) bsearch (&w,word_list,word_list_count,sizeof(struct word),compare_words)) { token[token_count].code = match->code; strcpy (token[token_count].string,string); token[token_count].line_number = line_count; } else { token[token_count].code = WIdentifier; strcpy (token[token_count].string,string); token[token_count].line_number = line_count; } token_count++; } if (e) s = e; else s = b + strlen(b); break; } if (token_count >= token_limit) { if (!(token = realloc (token,(token_limit + GRANULARITY)*sizeof(struct token)))) { sprintf (message,"Error: could not enlarge token array to %d elements",token_limit + GRANULARITY); error_exit (message); } token_limit += GRANULARITY; } } } fclose (in); } else { sprintf (message,"Warning: could not open rule file %s",job_file); warning (message); } } struct taxon_list { struct taxon_list *next; char name[MAX_NAME_LENGTH]; int index; }; struct create_list { struct create_list *next; char name[MAX_NAME_LENGTH]; struct taxon_list *add; }; struct operation { struct operation *next; enum word_code code; enum statistic_code stat; double value; }; static struct create_list *new_taxon = NULL; static struct taxon_list *ignore = NULL; static struct operation *op = NULL; static int increment (int i) { i++; if (i >= token_count) { sprintf (message,"Error: unexpected end of rule file"); error_exit (message); } return (i); } void read_rules (struct data_base *p) { int i,j; int number; char *s; struct create_list *c = NULL; struct taxon_list *t = NULL; struct taxon_list *d = NULL; struct operation *q = NULL; if (*p->rule.filespec == 0) return; find_tokens (p->rule.filespec); i = 0; while (i < token_count) { switch (token[i].code) { /*----------------------------------------------------------*\ | new taxon_name { | | add taxon_name | | ... | | add taxon_name | | } | \*----------------------------------------------------------*/ case Wnew: /*------------------------------------------------------*\ | The name of the new taxon is required; at this point | | we make a create_list item for the new taxon. | \*------------------------------------------------------*/ i = increment (i); if (token[i].code == WIdentifier) { if (!c) if (new_taxon = (struct create_list *) malloc (sizeof (struct create_list))) { c = new_taxon; c->next = NULL; strcpy (c->name,token[i].string); /*------------------------------------------*\ | Replace '_' with ' ' in the name. This | | reverses the operation that the user had | | to do so that the parser (code above) | | would recognize a taxon name as a single | | token rather than several, if the name | | contained spaces. This line occurs each | | time a name is copied from the token | | string in this routine. | \*------------------------------------------*/ for (s=c->name; *s; s++) if (*s == '_') *s = ' '; c->add = NULL; } else { sprintf (message,"Error: could not allocate create list head"); error_exit (message); } else { if (c->next = (struct create_list *) malloc (sizeof (struct create_list))) { c = c->next; c->next = NULL; strcpy (c->name,token[i].string); for (s=c->name; *s; s++) if (*s == '_') *s = ' '; c->add = NULL; } else { sprintf (message,"Error: could not allocate create list element"); error_exit (message); } } } else { sprintf (message,"Error: expected new variable name, got \"%s\" in line %d of rule file %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); } t = NULL; /*------------------------------------------------------*\ | Next come left brace, a sequence of add statements, | | followed by right brace. | \*------------------------------------------------------*/ i = increment (i); if (token[i].code == WLBrace) { i = increment (i); while (token[i].code != WRBrace) { switch (token[i].code) { /*------------------------------------------*\ | Each add statement requires a taxon_list | | entry. | \*------------------------------------------*/ case Wadd: i = increment (i); if (token[i].code == WIdentifier) { if (!t) if (c->add = (struct taxon_list *) malloc (sizeof (struct taxon_list))) { t = c->add; strcpy (t->name,token[i].string); for (s=t->name; *s; s++) if (*s == '_') *s = ' '; t->index = -1; t->next = NULL; } else { sprintf (message,"Error: could not allocate taxon list head"); error_exit (message); } else if (t->next = (struct taxon_list *) malloc (sizeof (struct taxon_list))) { t = t->next; strcpy (t->name,token[i].string); for (s=t->name; *s; s++) if (*s == '_') *s = ' '; t->index = -1; t->next = NULL; } else { sprintf (message,"Error: could not allocate taxon list element"); error_exit (message); } i = increment (i); } else { sprintf (message,"Error: expected variable name, got \"%s\" in line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); } break; /*------------------------------------------*\ | } | \*------------------------------------------*/ case WRBrace: break; /*------------------------------------------*\ | Anything else is an error | \*------------------------------------------*/ default: sprintf (message,"Error: unexpected \"%s\" in line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); break; } } } else { sprintf (message,"Error: expected '{', got \"%s\" on line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); } if (token[i].code == WRBrace) i++; break; /*----------------------------------------------------------*\ | ignore taxon_name | \*----------------------------------------------------------*/ case Wignore: i = increment (i); if (token[i].code == WIdentifier) { if (!d) if (ignore = (struct taxon_list *) malloc (sizeof (struct taxon_list))) { d = ignore; strcpy (d->name,token[i].string); d->next = NULL; d->index = -1; for (s=d->name; *s; s++) if (*s == '_') *s = ' '; } else { sprintf (message,"Error: could not allocate ignore list head"); error_exit (message); } else if (d->next = (struct taxon_list *) malloc (sizeof (struct taxon_list))) { d = d->next; strcpy (d->name,token[i].string); for (s=d->name; *s; s++) if (*s == '_') *s = ' '; d->index = -1; d->next = NULL; } else { sprintf (message,"Error: could not allocate ignore list element"); error_exit (message); } i++; } else { sprintf (message,"Error: expected variable name, got \"%s\" in line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); } break; /*----------------------------------------------------------*\ | {add | sub | mul | div} [all] [by] {number | sum} | \*----------------------------------------------------------*/ case Wadd: case Wsubtract: case Wmultiply: case Wdivide: if (q) if (q->next = (struct operation *) malloc (sizeof (struct operation))) { q = q->next; q->next = NULL; q->code = token[i].code; q->stat = Wvalue; q->value = 0.0; } else { sprintf (message,"Error: could not allocate operation list element"); error_exit (message); } else if (op = (struct operation *) malloc (sizeof (struct operation))) { q = op; q->next = NULL; q->code = token[i].code; q->stat = Wvalue; q->value = 0.0; } else { sprintf (message,"Error: could not allocate operation list head"); error_exit (message); } i = increment (i); if (token[i].code == Wall) i = increment (i); if (token[i].code == Wby) i = increment (i); switch (token[i].code) { case WIdentifier: /*----------------------------------------------*\ | Determine whether the identifier is one of | | the reserved words count,sum,ssq,mean,var, | | or sdev. If so, the value will be computed | | for each sample when the rule is applied. | \*----------------------------------------------*/ for (j=0; stat_list[j].name; j++) if (stricmp (token[i].string,stat_list[j].name) == 0) { q->stat = stat_list[j].code; q->value = 0.0; i++; break; } /*----------------------------------------------*\ | If the identifier is not one of the reserved | | statistic names, assume it is a number and | | use the number. | \*----------------------------------------------*/ if (stat_list[j].name == NULL) { q->stat = Wvalue; q->value = strtod (token[i].string,&s); if (s == token[i].string) { sprintf (message,"Error: expected number, got \"%s\" in line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); } i++; } break; default: sprintf (message,"Error: unexpected \"%s\" in line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); break; } break; default: sprintf (message,"Error: unexpected \"%s\" in line %d of %s", token[i].string, token[i].line_number, p->rule.filespec ); error_exit (message); break; } } if (token) { free (token); token = NULL; token_count = 0; } } /*----------------------------------------------------------------------*\ | Procedure for applying rules: | | | \*----------------------------------------------------------------------*/ struct index_link { char *name; int raw_data_column; }; static int index_link_compare (const void *e1, const void *e2) { struct index_link *L1,*L2; L1 = (struct index_link *) e1; L2 = (struct index_link *) e2; return (strcmp (L1->name,L2->name)); } void apply_rules (struct data_base *p) { int i,j,k,n; int copy_count,new_count; struct create_list *c = NULL; struct taxon_list *t = NULL; struct taxon_list *d = NULL; struct operation *q = NULL; struct index_link *link; double sum,ssq; char *s; /*------------------------------------------------------------------*\ | Count the number of raw taxa to be copied. This is p->raw.count | | minus the number of entries in the ignore list that are also in | | p->raw.name. | \*------------------------------------------------------------------*/ copy_count = 0; for (i=0; i < p->raw.count; i++) { /*--------------------------------------------------------------*\ | If this name does not appear in the ignore list, increment | | copy_count. Otherwise, note in the ignore list element what | | index the name has in the name array. | \*--------------------------------------------------------------*/ for (d=ignore; d; d=d->next) if (strcmp (p->raw.name[i],d->name) == 0) { d->index = i; break; } if (d == NULL) copy_count++; } /*------------------------------------------------------------------*\ | Count the number of new taxa to be created. Check the new taxon | | names against those already in the name array and against those | | in the ignore list. Note the index of each raw taxa that are | | added to make each new taxon. | \*------------------------------------------------------------------*/ new_count = 0; for (c=new_taxon; c; c=c->next) { for (i=0; i < p->raw.count; i++) if (strcmp (c->name,p->raw.name[i]) == 0) { sprintf (message,"Error: new variable \"%s\" already exists in data file %s", c->name, p->raw.filespec ); error_exit (message); } for (d=ignore; d; d=d->next) if (strcmp (c->name,d->name) == 0) { sprintf (message,"Error: You asked to create AND ignore new variable \"%s\" in rule file %s", c->name, p->rule.filespec ); error_exit (message); } for (t=c->add; t; t=t->next) { for (i=0; i < p->raw.count; i++) if (strcmp (p->raw.name[i],t->name) == 0) { t->index = i; break; } if (i == p->raw.count) { sprintf (message,"Warning: new variable \"%s\" refers to old variable \"%s\" which does not occur in data file %s", c->name, t->name, p->raw.filespec ); warning (message); t->index = -1; } } new_count++; } /*------------------------------------------------------------------*\ | For debugging purposes, print out the rules. | \*------------------------------------------------------------------*/ if (verbose) { if (new_taxon || ignore || op) printf ("Summary of rules read from %s\n",p->rule.filespec); if (new_taxon) { printf (" New variables to be created:\n"); for (c=new_taxon; c; c=c->next) { printf (" name: \"%s\"\n",c->name); for (t=c->add; t; t=t->next) printf (" add %3d, \"%s\"\n",t->index,t->name); } } if (ignore) { printf (" Existing taxa to be ignored:\n"); for (t=ignore; t; t=t->next) printf (" ignore %3d, \"%s\"\n",t->index,t->name); } if (op) { printf (" Operations to be performed on each sample:\n"); for (q=op; q; q=q->next) { char op_name[4]; switch (q->code) { case Wadd: strcpy (op_name,"add"); break; case Wsubtract: strcpy (op_name,"sub"); break; case Wmultiply: strcpy (op_name,"mul"); break; case Wdivide: strcpy (op_name,"div"); break; default: sprintf (op_name,"%.3d",q->code); break; } switch (q->stat) { case Wcount: printf (" %s by count\n",op_name); break; case Wsum: printf (" %s by sum\n",op_name); break; case Wssq: printf (" %s by ssq\n",op_name); break; case Wmean: printf (" %s by mean\n",op_name); break; case Wvar: printf (" %s by var\n",op_name); break; case Wsdev: printf (" %s by sdev\n",op_name); break; case Wvalue: printf (" %s by %lf\n",op_name,q->value); break; default: break; } } } } /*------------------------------------------------------------------*\ | Tell the user which names in the ignore list are not in the data | | file. | \*------------------------------------------------------------------*/ for (d=ignore; d; d=d->next) if (d->index == -1) { sprintf (message,"Warning: rule file %s says to ignore \"%s\", which isn't in %s", p->rule.filespec, d->name, p->raw.filespec ); warning (message); } /*------------------------------------------------------------------*\ | Copy the data from raw to data so that the variables appear in | | data sorted according to their names. | | | | This is accomplished by creating an array of structures called | | index_link, each of which contains the name of a variable and | | the index into p->raw.name where that name is found. If the | | variable is new (created by a rule), the index is -1. | | | | First you have to create the array of index_link structures, then| | you have to sort this array so that the names are in sequence. | | The reason why they are structures is that the index information | | must tag along with the name when the names are moved around. | | | \*------------------------------------------------------------------*/ p->data.count = copy_count + new_count; if (link = (struct index_link *) malloc (p->data.count * sizeof(struct index_link))) { j = 0; for (i=0; i < p->raw.count; i++) { for (d=ignore; d; d=d->next) if (strcmp (p->raw.name[i],d->name) == 0) break; if (d == NULL) { link[j].raw_data_column = i; link[j].name = p->raw.name[i]; j++; } } for (c=new_taxon; c; c=c->next) { link[j].raw_data_column = -1; link[j].name = c->name; j++; } qsort (link,p->data.count,sizeof(struct index_link),index_link_compare); #ifdef DEBUG printf ("Link array:\n"); for (j=0; j < p->data.count; j++) printf (" %2d: (%2d) \"%s\"\n",j,link[j].raw_data_column,link[j].name); #endif /*--------------------------------------------------------------*\ | The link array is sorted. Create the array of pointers to | | the names of variables that are not ignored or are new. | | These variables will be stored in array p->sample[i].data. | \*--------------------------------------------------------------*/ if (p->data.name = (char **) malloc (p->data.count * sizeof(char *))) { /*----------------------------------------------------------*\ | Compute size for and allocate the data name buffer, fill | | the buffer and assign pointers | \*----------------------------------------------------------*/ for (j=0,n=0; j < p->data.count; j++) n += 1 + strlen (link[j].name); if (!(p->data.name_buffer = (char *) malloc (n))) { sprintf (message,"Error: could not allocate data name buffer"); error_exit (message); } s = p->data.name_buffer; for (j=0; j < p->data.count; j++) { strcpy (s,link[j].name); p->data.name[j] = s; s += 1 + strlen (s); } /*----------------------------------------------------------*\ | For each sample in the data base, transfer to the array | | data all values in the array raw in the order given by | | the array link. Since link does not contain variables | | that the user wanted to ignore, this has the effect of | | forgetting those variables. | | | | The link array also includes those variables specified | | as new; their values must be computed for each sample. | \*----------------------------------------------------------*/ for (i=0; i < p->count; i++) /* for each sample */ if (p->sample[i].data = (double *) malloc (p->data.count * sizeof(double))) { for (j=0; j < p->data.count; j++) { /*----------------------------------------------*\ | For each variable to be stored in data, find | | out if the variable is to be copied directly | | from the raw data. This is true if the | | structure member raw_data_column is some | | positive number or zero, not -1. | \*----------------------------------------------*/ if (link[j].raw_data_column >= 0) p->sample[i].data[j] = p->sample[i].raw[link[j].raw_data_column]; else { /* link[j].raw_data_column == -1 */ int ok; /*------------------------------------------*\ | This is a new variable, and you have to | | find the data from the taxon list of the | | new taxon whose name matches link[j].name. | | | If any of the raw values for this taxon | | are MISSING_VALUE, don't include them; | | if all are missing, set the resulting | | data value to MISSING_VALUE. | \*------------------------------------------*/ for (c=new_taxon; c; c=c->next) if (strcmp (c->name,link[j].name) == 0) break; p->sample[i].data[j] = 0.0; ok = 0; for (t=c->add; t; t=t->next) if (p->sample[i].raw[t->index] != MISSING_VALUE) { p->sample[i].data[j] += p->sample[i].raw[t->index]; ok = 1; } if (!ok) p->sample[i].data[j] = MISSING_VALUE; } } /*--------------------------------------------------*\ | Now p->sample[i].data contains the proper values | | from p->sample[i].raw, and we have no more need | | for p->sample[i].raw. Release its space. | \*--------------------------------------------------*/ free (p->sample[i].raw); p->sample[i].raw = NULL; /*--------------------------------------------------*\ | Apply arithmetic operations to p->sample[i].data | | | | Quandary: should the statistics be calculated | | before all operations or before each operation | | that refers to either one? I choose the former. | \*--------------------------------------------------*/ k = 0; sum = 0.0; ssq = 0.0; for (j=0; j < p->data.count; j++) if (p->sample[i].data[j] != MISSING_VALUE) { double a = p->sample[i].data[j]; k++; sum += a; ssq += a*a; } for (q=op; q; q=q->next) { switch (q->stat) { case Wcount: q->value = (double) k; break; case Wsum: q->value = sum; break; case Wssq: q->value = ssq; break; case Wmean: q->value = sum/(double)k; break; case Wvar: q->value = (ssq - sum*sum/((double)k))/(double)k; break; case Wsdev: q->value = sqrt ((ssq - sum*sum/((double)k))/(double)k); break; default: break; } switch (q->code) { case Wadd: for (j=0; j < p->data.count; j++) if (p->sample[i].data[j] != MISSING_VALUE) p->sample[i].data[j] += q->value; break; case Wsubtract: for (j=0; j < p->data.count; j++) if (p->sample[i].data[j] != MISSING_VALUE) p->sample[i].data[j] -= q->value; break; case Wmultiply: for (j=0; j < p->data.count; j++) if (p->sample[i].data[j] != MISSING_VALUE) p->sample[i].data[j] *= q->value; break; case Wdivide: for (j=0; j < p->data.count; j++) if (p->sample[i].data[j] != MISSING_VALUE) p->sample[i].data[j] /= q->value; break; default: break; } } /*--------------------------------------------------*\ | For debugging purposes, output the data array. | \*--------------------------------------------------*/ #ifdef DEBUG printf ("%s",p->sample[i].id); for (j=0; j < p->data.count; j++) printf ("\t%5.2lf",p->sample[i].data[j]); printf ("\n"); #endif } else { sprintf (message,"Error: could not allocate data array"); error_exit (message); } } else { sprintf (message,"Error: could not allocate data name array"); error_exit (message); } free (link); } else { sprintf (message,"Error: could not allocate index link array"); error_exit (message); } /*------------------------------------------------------------------*\ | Release memory held by the rules. | \*------------------------------------------------------------------*/ if (d = ignore) do { t = d; d = d->next; free (t); } while (d); ignore = NULL; for (c=new_taxon; c; c=c->next) if (d = c->add) do { t = d; d = d->next; free (t); } while (d); if (c = new_taxon) { struct create_list *cc; do { cc = c; c = c->next; free (cc); } while (c); } new_taxon = NULL; if (q = op) { struct operation *qq; do { qq = q; q = q->next; free (qq); } while (q); } op = NULL; } /*----------------------------------------------------------------------*\ \*----------------------------------------------------------------------*/