Code:sr28-collate.c
This program converts USDA nutrition data into an easier table to run SQL queries on.
- Input: sr28asc.zip (must be unzipped in a subfolder data/sr28asc/)
- Output: sr28-collated.csv
// sr28-collate.c
// This program converts USDA nutrition data into an easier table to run SQL queries on.
// Input: sr28asc.zip (USDA Standard Reference [SR Legacy] ascii version 28)
// Output: sr28-collated.csv
// Instructions:
// - Unzip sr28asc.zip to subfolder: data/sr28asc/
// - Compile and run this program from a Linux terminal: gcc sr28-collate.c -O1 && ./a.out
// - When it finishes, sr28-collated.csv will appear in the subfolder 'data'.
// Author: Elie Goldman Smith <elie@olam.wiki>
// License: Creative Commons License
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Path of the CSV file that main() writes.
#define OUTPUT_FILENAME "data/sr28-collated.csv"
// Sizes of the lookup tables below, which are indexed directly by id.
#define MAXNUTRIENTS 888 // all nutrient id's must be smaller than this
#define MAXFOODS 94444 // all food id's must be smaller than this
// Number of nutrient definition records accepted while parsing NUTR_DEF.txt.
int nNutrients=0;
// Number of food description records accepted while parsing FOOD_DES.txt.
int nFoods=0;
// food_lines[fid]: pointer to the start of that food's record inside the
// loaded FOOD_DES.txt buffer, or NULL if no record with that id was seen.
char *food_lines[MAXFOODS] = {NULL};
// nutrient_names[nid] / nutrient_units[nid]: pointers to the name and unit
// cells inside the loaded NUTR_DEF.txt buffer, or NULL if not seen.
char *nutrient_names[MAXNUTRIENTS] = {NULL};
char *nutrient_units[MAXNUTRIENTS] = {NULL};
// food_nutrients[fid]: calloc'd table of MAXNUTRIENTS entries; entry [nid]
// points at the amount text inside the loaded NUT_DATA.txt buffer, or is NULL
// when that food has no value for that nutrient.
char **food_nutrients[MAXFOODS] = {NULL};
// Output stream used by output_cell() and output_line(); opened in main().
FILE *_out;
// Loads an entire file into memory as a NUL-terminated buffer (uses malloc).
// param: filename: path of the file to read
// param: size: receives the number of bytes loaded (0 on any failure)
// return value: malloc'd buffer that the caller must free(), or NULL if the
//               file could not be opened, measured, or allocated
char *read_all(const char *filename, int *size)
{
    *size = 0;

    // Binary mode: the byte count from ftell() must match what fread() delivers,
    // which text mode does not guarantee on platforms that translate newlines.
    FILE *f = fopen(filename, "rb");
    if (!f) {
        perror(filename);
        return NULL;
    }

    // Measure the file; reject sizes that are unknown or do not fit in an int
    // (leaving room for the terminating NUL).
    long len = -1;
    if (fseek(f, 0, SEEK_END) == 0) len = ftell(f);
    if (len < 0 || len >= INT_MAX || fseek(f, 0, SEEK_SET) != 0) {
        printf("Cannot determine the size of %s (or it is too large)\n", filename);
        fclose(f);
        return NULL;
    }

    char *data = malloc((size_t)len + 1);
    if (!data) {
        printf("Failed to allocate memory (%d bytes) for %s\n", (int)len + 1, filename);
        fclose(f);
        return NULL;
    }

    printf("Loading %s (%d bytes)\n", filename, (int)len);
    // An empty file is not an error; only a short read is reported.
    if (len > 0 && fread(data, (size_t)len, 1, f) != 1) {
        printf("^ failed\n");
        len = 0; // treat the content as empty, but still return a valid string
    }
    data[len] = '\0';
    *size = (int)len;

    fclose(f);
    return data;
}
// Emits the text of one "cell" of an input record to the output file,
// without its surrounding delimiters.
// input example: ~foo~
// output example: foo
// Double quotes inside the cell are doubled ("") so the text can sit inside a
// quoted CSV field.
// Note: every leading '~', '^', '\r' and '\n' is skipped before copying, so if
// p points at an empty cell (~~) the text of the following cell is written.
// param: p: pointer to the beginning of the cell
void output_cell(const char *p) {
    const char *s = p;

    // step over the run of leading delimiter / line-ending characters
    for (;;) {
        char c = *s;
        if (c != '~' && c != '^' && c != '\r' && c != '\n') break;
        s++;
    }

    // copy until the next delimiter, line ending, or end of string
    for (; *s != '\0'; s++) {
        switch (*s) {
        case '~':
        case '^':
        case '\r':
        case '\n':
            return;
        case '\"':
            fputs("\"\"", _out); // escape quotes
            break;
        default:
            fputc(*s, _out);
            break;
        }
    }
}
// Converts one record of an input file to CSV and writes it to the output file.
// input example: ~foo~^~bar~^~baz~ (terminated by '\r', '\n' or end of string)
// output example: "foo","bar","baz" (no newline is written)
// Each '^' becomes a comma, each '~' becomes a double quote, and a literal
// double quote is doubled; everything else is copied unchanged.
// param: p: pointer to the beginning of the line
// return value: number of cells written (0 for an empty line)
int output_line(const char *p) {
    // an empty line produces no output and no cells
    if (*p == '\0' || *p == '\r' || *p == '\n') return 0;

    int cells = 1; // a non-empty line has one more cell than it has '^' separators
    for (const char *s = p; *s != '\0' && *s != '\r' && *s != '\n'; s++) {
        switch (*s) {
        case '^':
            fputc(',', _out);
            cells++;
            break;
        case '~':
            fputc('\"', _out);
            break;
        case '\"':
            fputs("\"\"", _out);
            break;
        default:
            fputc(*s, _out);
            break;
        }
    }
    return cells;
}
// Entry point: loads the three input files, indexes their records by food id
// and nutrient id, and writes one CSV row per food to OUTPUT_FILENAME.
// return value: always 0 (problems are reported on stdout/stderr)
int main(void) {
    // load files
    int size1, size2, size3;
    char *food_des = read_all("data/sr28asc/FOOD_DES.txt", &size1);
    char *nutr_def = read_all("data/sr28asc/NUTR_DEF.txt", &size2);
    char *nut_data = read_all("data/sr28asc/NUT_DATA.txt", &size3);
    if (nutr_def && food_des && nut_data) {
        // Ids are parsed with strtol(), which stops at the first character that
        // cannot be part of the number, so each call reads only a few characters
        // of the (very long) buffer. Unlike atoi(), its behavior is also defined
        // when the digits do not fit in the result type.
        printf("Parsing input files...\n");
        int alloc_ok = 1;

        // Food descriptions
        char *end = food_des+size1;
        for (char *p=food_des; p<end; p++) {
            char *line = p;
            while (p<end && (*p < '0' || *p > '9')) p++; // skip to next numeric char
            if (p >= end) break; // only non-numeric text left: not a record
            long fid = strtol(p, NULL, 10); // get food id
            if (fid >= 0 && fid < MAXFOODS) {
                nFoods++;
                food_lines[fid] = line; // save pointer to line of text
                // allocate memory for food nutrient amounts
                // (a repeated id reuses the table it already has)
                if (alloc_ok && !food_nutrients[fid]) {
                    food_nutrients[fid] = calloc(MAXNUTRIENTS, sizeof *food_nutrients[fid]);
                    if (!food_nutrients[fid]) {
                        printf("Not enough memory. Some foods will be missing.\n");
                        alloc_ok = 0;
                    }
                }
            }
            while (p<end && *p != '\n') p++; // skip to next line
        }

        // Nutrient definitions
        end = nutr_def+size2;
        for (char *p=nutr_def; p<end; p++) {
            while (p<end && (*p < '0' || *p > '9')) p++; // skip to next numeric char
            if (p >= end) break; // only non-numeric text left: not a record
            long nid = strtol(p, NULL, 10); // get nutrient id
            if (nid >= 0 && nid < MAXNUTRIENTS) {
                while (p<end && *p != '^') p++; // skip to next carat
                if (p<end) nutrient_units[nid] = ++p; // get nutrient unitname pointer
                while (p<end && *p != '^') p++; // skip to next carat
                if (p<end) p++; // next char
                while (p<end && *p != '^') p++; // skip to next carat
                if (p<end) nutrient_names[nid] = ++p; // get nutrient name pointer
                nNutrients++;
            }
            while (p<end && *p != '\n') p++; // skip to next line
        }

        // Nutrition data of foods
        printf("%d nutrients, %d foods\n", nNutrients, nFoods);
        int nSkipped = 0; // amounts that had no food table to be stored in
        end = nut_data+size3;
        for (char *p=nut_data; p<end; p++) {
            while (p<end && (*p < '0' || *p > '9')) p++; // skip to next numeric char
            if (p >= end) break; // only non-numeric text left: not a record
            long fid = strtol(p, NULL, 10); // get food id
            while (p<end && *p != '^') p++; // skip to next carat
            while (p<end && (*p < '0' || *p > '9')) p++; // skip to next numeric char
            if (p >= end) break; // record is cut off before the nutrient id
            long nid = strtol(p, NULL, 10); // get nutrient id
            if (fid >= 0 && fid < MAXFOODS && nid >= 0 && nid < MAXNUTRIENTS) {
                if (food_nutrients[fid]) {
                    while (p<end && *p != '^') p++; // skip to next carat
                    while (p<end && (*p < '0' || *p > '9') && *p != '.') p++; // skip to next numeric char
                    food_nutrients[fid][nid] = p; // get pointer to 'amount' of nutrient in food
                }
                else nSkipped++; // food is not in FOOD_DES.txt, or its table could not be allocated
            }
            while (p<end && *p != '\n') p++; // skip to next line
        }
        if (nSkipped) printf("Skipped %d nutrient amounts belonging to foods that are not loaded.\n", nSkipped);

        // Output
        _out = fopen(OUTPUT_FILENAME, "w");
        if (_out) {
            printf("Writing to %s...\n", OUTPUT_FILENAME);
            // header
            fputs("\"NDB_No\",\"FdGrp_Cd\",\"Long_Desc\",\"Shrt_Desc\",\"ComName\",\"ManufacName\",\"Survey\",\"Ref_desc\",\"Refuse\",\"SciName\",\"N_Factor\",\"Pro_Factor\",\"Fat_Factor\",\"CHO_Factor\"",_out);
            for (int i=0; i<MAXNUTRIENTS; i++) {
                if (!nutrient_names[i]) continue;
                fputs(",\"", _out);
                output_cell(nutrient_names[i]);
                fputs(" (", _out);
                output_cell(nutrient_units[i]);
                fputs(")\"", _out);
            }
            fputc('\n', _out);

            // data
            int nLines=0;
            for (int i=0; i<MAXFOODS; i++) {
                // foods whose nutrient table could not be allocated are left out
                if (!food_lines[i] || !food_nutrients[i]) continue;
                int n = output_line(food_lines[i]);
                if (n != 14) printf("WTF? Food should have 14 fields, not %d.\n", n); // sanity check
                for (int j=0; j<MAXNUTRIENTS; j++) {
                    if (!nutrient_names[j]) continue;
                    fputs(",\"",_out);
                    if (food_nutrients[i][j]) output_cell(food_nutrients[i][j]);
                    fputc('\"', _out);
                }
                fputc('\n', _out);
                if (++nLines % 100 == 0) printf("Wrote %d foods\n", nLines);
            }
            printf("Wrote %d foods\n", nLines);

            // fclose() flushes buffered output, so a full disk may only show up here
            int write_error = ferror(_out);
            if (fclose(_out) != 0) write_error = 1;
            if (write_error) fprintf(stderr, "Error while writing to %s\n", OUTPUT_FILENAME);
            else if (nLines==nFoods) printf("Done.\n");
        }
        else perror("Cannot write to "OUTPUT_FILENAME);
    }

    // cleanup (free(NULL) is a no-op, so no guards are needed)
    free(food_des);
    free(nutr_def);
    free(nut_data);
    for (int i=0; i<MAXFOODS; i++) free(food_nutrients[i]); // one table per food id
    return 0;
}