/* Copyright 2002 Daniel Egnor.  See LICENSE file.
 *
 * This program is part of the processing chain to convert TIGER/Line data
 * into an index ("map") for rapid address lookup.  It reads "step2" data as
 * generated by "geo-1-to-2" and "geo-fips-to-2", canonicalizes the 
 * geographical feature names according to a set of alias read from 
 * another file.  The alias definitions are included in the output so
 * that queries can be canonicalized to match the input.
 *
 * Input and output records are in the same format:
 *
 * NEMain Street  //  12345   323232312345   3232454
 *  |<-----name---//-><zip><-offset-><zip><-offset->
 * type/parity    //  <----start----><-----end----->
 *
 * Aliases are included in the output by setting the "type/parity"
 * character to 'A' and leaving the zipcode and offset fields blank.
 * The "name" field contains the alias definition, which looks like
 * "Full Name = Alias", e.g. "Street = St" or "North East = NE".
 * The alias input file simply consists of a list of alias definitions,
 * one per line.
 *
 * This whole thing is necessary to avoid confusion over the different
 * abbreviations used in addresses ("Avenue" vs. "Ave" vs. "Av",
 * "North East" vs. "Northeast" vs. "N.E." vs. "NE", and so on).
 *
 * This program does not touch the map itself.
 * */

#include "io.h"

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>

#define D(x)

typedef char name[52];
name *array = NULL;
int array_size = 0,array_alloc = 0;

/* Normalize a string by canonicalizing the spacing, removing punctuation,
 * and deleting parenthetized text.  (FIPS records and street names in TIGER
 * sometimes include parenthetical comments that should be ignored for
 * matching purposes.) */
static void normalize(char *str,int len,char allow) {
        int i,out = 0,depth = 0;
        for (i = 0; i < len; ++i) {
                const char ch = str[i];
                if ('(' == ch)
                        ++depth;
                if (')' == ch && depth > 0)
                        --depth;
		if (isalpha(ch) && 0 != out && isdigit(str[out - 1]))
			;
                else if (0 == depth && (ch == allow || isalnum(ch)))
                        str[out++] = ch;
                else if (0 != out && ' ' != str[out - 1])
                        str[out++] = ' ';
        }

        while (out != len) str[out++] = ' ';
}

/* Comparison function for qsort(). */
static int compare(const void *a,const void *b) {
	const name *na = (const name *) a;
	const name *nb = (const name *) b;
	return strncasecmp(*na,*nb,sizeof *na);
}

/* Binary search for the first string greater than or equal to the
 * supplied value in a sorted list of strings.  Used for alias lookup. */
static int find(int begin,int end,const char *value,int len) {
	const int count = end - begin;
	const int mid = begin + count / 2;

	if (count <= 1) return begin;
	if (len > sizeof *array) len = sizeof *array;

	if (strncasecmp(value,array[mid - 1],len) > 0)
		return find(mid,end,value,len);
	else
		return find(begin,mid,value,len);
}

/* Canonicalize (replace aliases) in the string between "begin" and "end",
 * but look only for matches against a prefix longer than word - begin.
 * This is a relatively complex recursive function because of the issues
 * involved in the detection of multi-word aliases and the requirement to
 * perform recursive alias substitution (one alias may result in something
 * which forms all or part of another alias's definition).
 * Precondition: begin <= word <= end. 
 * Return: Nonzero if an alias was replaced. */
static int substitute(char *begin,char *word,char *end) {
        /* Aliases are matched word-wise; find the next word in the input. */
	char *next = word;
	if (next != end && ' ' != *next) {
		do ++next; while (next != end && ' ' != *next);
		if (next != end) ++next;
	}

	if (next != word) {
                /* Look for alias changes in the rest of the string first */
		if (begin == word)
			substitute(next,next,end);
                /* Look for longer alias matches before trying shorter ones */
		if (substitute(begin,next,end))
			return 1;
	}

        /* Now search the alias table and make a replacement if warranted. */ 
D(fprintf(stderr,"Alias search: '%.*s'\n",word - begin,begin));
	if (begin != word && word - begin < sizeof(*array)) {
		const int len = word - begin;
		const int f = find(0,array_size,begin,len);
D(fprintf(stderr,"   Best match: %.*s\n",(int) sizeof(*array),array[f]));
		if (f < array_size
		&& !strncasecmp(array[f],begin,len)
		&&  '=' == array[f][len]) {
			const char * const rbegin = &array[f][len + 2];
			const char * rend = &array[f][sizeof(*array)];

			while (rend != rbegin && ' ' == rend[-1]) --rend;
			if (rend - rbegin > word - begin - 1) {
				fprintf(stderr,"warning: ignoring replacement '%.*s' longer than original '%.*s'\n",
				        rend - rbegin,rbegin,
				        len - 1,array[f]);
				return 0;
			}
D(fprintf(stderr,"      Replacing with: %.*s\n",rend - rbegin,rbegin));
			memmove(begin + (rend - rbegin),word - 1,end - word + 1);
			memcpy(begin,rbegin,rend - rbegin);
			word = begin + (rend - rbegin) + (end - word + 1);
			while (word != end) *word++ = ' ';
D(fprintf(stderr,"      Result is: %.*s\n",end - begin,begin));

			substitute(begin,begin,end);
			return 1;
		}
	}

	return 0;
}

int main(int argc,char *argv[]) {
	char line[256];
	FILE *alias;

	if (argc != 2) {
		fprintf(stderr,"usage: %s alias-file < step2-data | sort -f > step2a-data\n",argv[0]);
		return 2;
	}

	alias = fopen(argv[1],"r");
	if (NULL == alias) {
		fprintf(stderr,"error: can't open %s\n",argv[1]);
		return 1;
	}

        /* Read the alias table into memory. */
	while (NULL != fgets(line,sizeof line,alias)) {
		int len = strlen(line);
		while (len < sizeof(line)) line[len++] = ' ';
		normalize(line,sizeof(line),'=');
		printf("NA%.52s                              \n",line);

		if (array_size == array_alloc) {
			array_alloc = array_alloc ? 2 * array_alloc : 10;
			array = realloc(array,array_alloc * sizeof *array);
			if (NULL == array) {
				fputs("error: out of memory\n",stderr);
				return 1;
			}
		}

		memcpy(array[array_size++],line,sizeof *array);
	}

        /* Sort the alias table so we can use binary search. */
	qsort(array,array_size,sizeof *array,compare);

        /* Read the input, convert aliases, write it out. */
	while (NULL != fgets(line,sizeof line,stdin)) {
		if (strlen(line) < 84) {
			fputs("warning: truncated input line\n",stderr);
			continue;
		}

		if ('N' != line[0]) {
			fputs("warning: invalid input record\n",stderr);
			continue;
		}

		normalize(line + 2,52,'\0');
		substitute(line + 2,line + 2,line + 54);
		fputs(line,stdout);
	}

	return 0;
}

