Code:sr28-collate.c

From the change wiki

This program converts USDA nutrition data into an easier table to run SQL queries on.

// sr28-collate.c
// This program converts USDA nutrition data into an easier table to run SQL queries on.
// Input: sr28asc.zip (USDA Standard Reference [SR Legacy] ascii version 28)
// Output: sr28-collated.csv
// Instructions:
// - Unzip sr28asc.zip to subfolder: data/sr28asc/
// - Run this program: type into a Linux terminal:  gcc sr28-collate.c -O1 && ./a.out
// - When it finishes, sr28-collated.csv will appear in the subfolder 'data'.
// Author: Elie Goldman Smith <elie@olam.wiki>
// License: Creative Commons License

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Path of the CSV file that main() creates.
#define OUTPUT_FILENAME "data/sr28-collated.csv"
// Sizes of the id-indexed lookup tables below. Records whose id is not
// smaller than the corresponding limit are ignored by the parser in main().
#define MAXNUTRIENTS  888 // all nutrient ids must be smaller than this
#define MAXFOODS    94444 // all food ids must be smaller than this
// Number of nutrient-definition records accepted by the parser in main().
int nNutrients=0;
// Number of food-description records accepted by the parser in main().
int nFoods=0;
// Indexed by food id: pointer to the start of that food's line inside the
// loaded FOOD_DES.txt buffer (NULL if the id was never seen).
char *food_lines[MAXFOODS] = {NULL};
// Indexed by nutrient id: pointer to the nutrient's name cell inside the
// loaded NUTR_DEF.txt buffer (NULL if the id was never seen).
char *nutrient_names[MAXNUTRIENTS] = {NULL};
// Indexed by nutrient id: pointer to the nutrient's unit cell inside the
// loaded NUTR_DEF.txt buffer.
char *nutrient_units[MAXNUTRIENTS] = {NULL};
// Indexed by food id: calloc'd table of MAXNUTRIENTS pointers, each pointing
// at the 'amount' text for one nutrient inside the loaded NUT_DATA.txt buffer
// (NULL entry = no value recorded). The table itself is NULL for unknown foods.
char **food_nutrients[MAXFOODS] = {NULL};
// Output stream shared by output_cell() and output_line(); opened in main().
// NOTE(review): file-scope identifiers starting with '_' are reserved in C;
// consider renaming everywhere in the file at once.
FILE *_out;


// Loads an entire file into memory (uses malloc).
// The file is opened in binary mode so that the size reported by ftell()
// always matches the number of bytes fread() delivers.
// param: filename: path of the file to load
// param: size: receives the number of bytes loaded; set to 0 on any failure
// return value: malloc'd buffer holding the file contents followed by a
//   terminating '\0' (the caller must free it), or NULL if the file could
//   not be opened, measured, allocated or read. An empty file is not an
//   error: it yields an empty string with *size == 0.
char *read_all(const char *filename, int *size)
{
 *size = 0;
 FILE *f = fopen(filename, "rb");
 if (!f) { perror(filename); return NULL; }

 char *data = NULL;
 long len = -1;
 if (fseek(f,0,SEEK_END) == 0) len = ftell(f);
 // len must be valid and len+1 must still fit the int-based interface
 if (len < 0 || len >= INT_MAX || fseek(f,0,SEEK_SET) != 0) {
  printf("Cannot determine the size of %s\n", filename);
 }
 else {
  data = malloc((size_t)len+1);
  if (data) {
   printf("Loading %s (%ld bytes)\n", filename, len);
   if (fread(data, 1, (size_t)len, f) == (size_t)len) {
    data[len] = '\0';
    *size = (int)len;
   }
   else {                       // short read: do not hand back partial data
    printf("^ failed\n");
    free(data);
    data = NULL;
   }
  }
  else printf("Failed to allocate memory (%ld bytes) for %s\n", len+1, filename);
 }
 fclose(f);
 return data;
}



// Writes out the contents of a "cell" from an input file's data to _out.
// Double quotes inside the cell are doubled so the text can be placed
// inside a quoted CSV field by the caller.
//   input example: ~foo~
//   output example: foo
// A blank cell (~~, or one that ends immediately at '^' or end of line)
// writes nothing.
// param: p: pointer to the beginning of the cell
void output_cell(const char *p) {
 // Skip only the opening text delimiter. Skipping every leading special
 // character would run past a blank cell and print the next cell instead.
 if (*p=='~') p++;
 while (*p &&  *p!='~' && *p!='^' && *p!='\r' && *p!='\n') {   // output string
  if (*p=='\"') fputs("\"\"",_out);                           // escape quotes
  else          fputc(*p,    _out);
  p++;
 }
}

// Converts one caret/tilde-delimited input line to CSV and writes it to _out.
//  input example:  ~foo~^~bar~^~baz~   (terminated by '\r', '\n' or '\0')
//  output example: "foo","bar","baz"   (no newline is written)
// Each '^' becomes ',', each '~' becomes '"', and a literal '"' is doubled.
// param: p: pointer to the beginning of the line
// return value: number of cells written (0 for an empty line)
int output_line(const char *p) {
 if (*p == '\0' || *p == '\r' || *p == '\n') return 0; // nothing on this line

 int cells = 1;                       // a non-empty line has at least one cell
 for (; *p != '\0' && *p != '\r' && *p != '\n'; ++p) {
  switch (*p) {
   case '^':                          // field separator starts a new cell
    fputc(',', _out);
    cells++;
    break;
   case '~':                          // text delimiter becomes a CSV quote
    fputc('\"', _out);
    break;
   case '\"':                         // escape embedded quotes
    fputs("\"\"", _out);
    break;
   default:
    fputc(*p, _out);
    break;
  }
 }
 return cells;
}




// Program entry point.
// Loads FOOD_DES.txt, NUTR_DEF.txt and NUT_DATA.txt, indexes their records by
// food id / nutrient id (pointers into the loaded buffers, no copying), and
// writes one CSV row per food to OUTPUT_FILENAME: the 14 food-description
// fields followed by one column per defined nutrient.
// return value: 0 if every parsed food was written and the output file was
//   closed without error; 1 otherwise.
int main(void) {
 int status = 1; // stays 1 unless the complete table is written successfully
 // load files
 int size1, size2, size3;
 char *food_des = read_all("data/sr28asc/FOOD_DES.txt", &size1);
 char *nutr_def = read_all("data/sr28asc/NUTR_DEF.txt", &size2);
 char *nut_data = read_all("data/sr28asc/NUT_DATA.txt", &size3);
 if (nutr_def && food_des && nut_data) {
  // Note on the parser below: atoi() converts like strtol(s, NULL, 10), so it
  // stops at the first character that cannot be part of the number. Calling
  // it on a pointer into the middle of a large buffer is therefore cheap.
  printf("Parsing input files...\n");
  int alloc_ok = 1;

  // Food descriptions
  char *end = food_des+size1;
  for (char *p=food_des; p<end; p++) {
   char *line = p;
   while (p<end &&(*p < '0' || *p > '9')) p++;// skip to next numeric char
   if (p>=end) break;                         // trailing bytes without an id: not a record
   int fid = atoi(p);                         // get food id
   if (fid >= 0 && fid < MAXFOODS) {
    if (!food_lines[fid]) nFoods++;           // count each food id once
    food_lines[fid] = line;                   // save pointer to line of text
    if (alloc_ok && !food_nutrients[fid]) {   // allocate memory for food nutrient amounts
     food_nutrients[fid] = calloc(MAXNUTRIENTS,sizeof(char*));
     if (!food_nutrients[fid]) {
      printf("Not enough memory. Some foods will be missing.\n");
      alloc_ok = 0;
     }
    }
   }
   while (p<end && *p != '\n') p++;           // skip to next line
  }

  // Nutrient definitions
  end = nutr_def+size2;
  for (char *p=nutr_def; p<end; p++) {
   while (p<end &&(*p < '0' || *p > '9')) p++;// skip to next numeric char
   if (p>=end) break;                         // trailing bytes without an id: not a record
   int nid = atoi(p);                         // get nutrient id
   if (nid >= 0 && nid < MAXNUTRIENTS) {
    while(p<end && *p != '^') p++;            // skip to next caret
    if   (p<end) nutrient_units[nid] = ++p;   // get nutrient unitname pointer
    while(p<end && *p != '^') p++;            // skip to next caret
    if   (p<end) p++;                         // next char
    while(p<end && *p != '^') p++;            // skip to next caret
    if   (p<end) nutrient_names[nid] = ++p;   // get nutrient name pointer
    nNutrients++;
   }
   while (p<end && *p != '\n') p++;           // skip to next line
  }

  // Nutrition data of foods
  printf("%d nutrients, %d foods\n", nNutrients, nFoods);
  end = nut_data+size3;
  for (char *p=nut_data; p<end; p++) {
   while (p<end &&(*p < '0' || *p > '9')) p++;// skip to next numeric char
   if (p>=end) break;                         // trailing bytes without an id: not a record
   int   fid = atoi(p);                       // get food id
   while (p<end && *p != '^') p++;            // skip to next caret
   while (p<end &&(*p < '0' || *p > '9')) p++;// skip to next numeric char
   if (p>=end) break;                         // truncated record: no nutrient id
   int   nid = atoi(p);                       // get nutrient id
   // food_nutrients[fid] is NULL for foods absent from FOOD_DES.txt or
   // skipped after an allocation failure; such records are ignored.
   if (fid >= 0 && fid < MAXFOODS && nid >= 0 && nid < MAXNUTRIENTS && food_nutrients[fid]) {
    while(p<end && *p != '^') p++;            // skip to next caret
    while(p<end &&(*p < '0' || *p > '9') && *p != '.') p++; // skip to next numeric char
    food_nutrients[fid][nid] = p;             // get pointer to 'amount' of nutrient in food
   }
   while (p<end && *p != '\n') p++;           // skip to next line
  }

  // Output
  _out = fopen(OUTPUT_FILENAME, "w");
  if (_out) {
   printf("Writing to %s...\n", OUTPUT_FILENAME);
   // header
   fputs("\"NDB_No\",\"FdGrp_Cd\",\"Long_Desc\",\"Shrt_Desc\",\"ComName\",\"ManufacName\",\"Survey\",\"Ref_desc\",\"Refuse\",\"SciName\",\"N_Factor\",\"Pro_Factor\",\"Fat_Factor\",\"CHO_Factor\"",_out);
   for (int i=0; i<MAXNUTRIENTS; i++) if (nutrient_names[i]) {
    fputs(",\"", _out);
    output_cell(nutrient_names[i]);
    fputs(" (",  _out);
    output_cell(nutrient_units[i]);
    fputs(")\"", _out);
   }
   fputc('\n',   _out);
   // data (foods whose nutrient table could not be allocated are left out,
   // as announced by the "Some foods will be missing" message)
   int nLines=0;
   for (int i=0; i<MAXFOODS; i++) if (food_lines[i] && food_nutrients[i]) {
    int n = output_line(food_lines[i]);
    if (n != 14) printf("WTF? Food should have 14 fields, not %d.\n", n); // sanity check
    for (int j=0; j<MAXNUTRIENTS; j++) if (nutrient_names[j]) {
     fputs(",\"",_out);
     if (food_nutrients[i][j]) output_cell(food_nutrients[i][j]);
     fputc('\"', _out);
    }
    fputc('\n',  _out);
    if (++nLines % 100 == 0) printf("Wrote %d foods\n", nLines);
   }
   printf("Wrote %d foods\n", nLines);
   // Check the stream before and at close: buffered write errors (e.g. a
   // full disk) may only surface when the data is flushed.
   int write_failed = ferror(_out);
   if (fclose(_out) != 0) write_failed = 1;
   if (write_failed) printf("Error while writing %s\n", OUTPUT_FILENAME);
   else if (nLines==nFoods) {
    printf("Done.\n");
    status = 0;
   }
  }
  else perror("Cannot write to "OUTPUT_FILENAME);
 }
 free(food_des);
 free(nutr_def);
 free(nut_data);
 for (int i=0; i<MAXFOODS; i++) free(food_nutrients[i]); // one table per food id; free(NULL) is a no-op
 return status;
}