/* Copyright 2002 Daniel Egnor.  See LICENSE file.
 *
 * This program is part of the processing chain to convert TIGER/Line data
 * into an index ("map") for rapid address lookup.  It reads "step2" data 
 * as output by "geo-1-to-2" and "geo-fips-to-2" and as filtered by 
 * "geo-2-to-2a".  It writes "step3" data for consumption by "geo-3-to-map".  
 * On the way, it continues building the map file.
 *
 * The input records associate names with "zones" of street address location
 * data.  However, because a name may appear in multiple different areas
 * (there are a great many "Main St" entries, for example), a name may be
 * associated with many zones.  In this step we create a "range" table
 * associated with each unique name.  The range table contains a pointer
 * for each zone associated with the name, in sorted order by zone offset.
 *
 * That way, in the query process, names (for streets, cities, and states)
 * can be merged to find zones of overlap, which will correspond to the
 * area of interest.  The overlap zones can be searched for the actual
 * address of interest to find geographical coordinates (usually by
 * interpolation within a city block).
 *
 * The input format is familiar:
 *
 * NEMain Street  //  12345   323232312345   3232454
 *  |<-----name---//-><zip><-offset-><zip><-offset->
 * type/parity    //  <----start----><-----end----->
 *
 * The output format contains one line per unique name:
 *
 * REMain Street  //      454545
 *  |<-----name---//-><-offset->
 * type/parity
 *
 * The input must be sorted, and the output will be sorted.  Sorting obviates
 * the need to have distinct 'start' and 'end' range offset values; the end of
 * one range is the start of the next.  (Sorting is also necessary to find all
 * unique names, and in the next step to build a sorted name table in the
 * map to allow quick binary search name lookup.) */

#include "io.h"

#include <stdio.h>
#include <string.h>

/* Look up a zone offset from input data.  Not all input data lines will have
 * a zone offset; some (such as those that come from FIPS) will only have a
 * zip code.  We use the zip code table, which has already been built in the
 * map by this point, to convert that into a zone offset.
 * Arguments: index = map file to use for lookup
 *            zip_offset = offset of zip code table in map
 *            end_offset = end of zone area in map (used for sanity check)
 *            line = zip code/offset pair from input line 
 * Return: zone offset, or -1 on error */
static int get_offset(struct io_file *index,
	int zip_offset,int end_offset,
	const char *line) 
{
	int addr;
	if (' ' == line[14]) {
                /* No zone offset in input data; use zip code instead */
		const int zip = io_strntoi(line,5);
		if (zip < 0 || zip > 99999) {
			fprintf(stderr,"warning: invalid zipcode %d\n",zip);
			return -1;
		}

		if (io_in_i4(index,zip_offset + 4 * zip,&addr) < 0)
			return -1;
	}
	else
                /* Use supplied zone offset */
		addr = io_strntoi(line + 5,10);

        /* Sanity check resulting zone offset */
	if (addr < zip_offset + 4*100000 || addr > end_offset) {
		fprintf(stderr,"warning: invalid offset %d\n",addr);
		return -1;
	}

	return addr;
}

int main(int argc,char *argv[]) {
	char line[256],prev[256] = "";
	struct io_file *index;
	int end,zip_offset;
	int name_pointer,name_offset;
	int end_pointer,end_offset;

	int addr_begin = 0,addr_end = 0;

	if (2 != argc) {
		fprintf(stderr,"usage: %s map-file < step2a-data > step3-data\n",argv[0]);
		return 2;
	}

	index = io_open(argv[1]);
	if (NULL == index) return 1;

	end = 0;
	end = io_in_i4(index,end,&zip_offset);
	end = io_in_i4(index,name_pointer = end,&name_offset);
	end = io_in_i4(index,end_pointer = end,&end_offset);
	if (end < 0) return 1;

	if (name_offset != end_offset)
		fputs("warning: overwriting previous name data\n",stderr);

	end = name_offset;

	while (NULL != fgets(line,sizeof line,stdin)) {
		int this_begin,this_end;

		if (strlen(line) < 84) {
			fputs("warning: truncated input line\n",stderr);
			continue;
		}

		if ('N' != line[0]) {
			fputs("warning: invalid input record\n",stderr);
			continue;
		}

		if (strncasecmp(line,prev,84) < 0) {
			fputs("warning: input line out of order\n",stderr);
			continue;
		}

                /* Get the zone associated with the current input line */
		this_begin = get_offset(index,zip_offset,name_offset,line + 54);
		this_end = get_offset(index,zip_offset,name_offset,line + 69);
		if (this_begin < 0 || this_end < 0) return 1;

		if (strncasecmp(line,prev,54) || this_begin > addr_end) {
                        /* New name, or discontiguous zone -> new entry
                         * in the current range table */
			if (addr_begin != addr_end) {
				end = io_out_i4(index,end,addr_begin);
				end = io_out_i4(index,end,addr_end);
				if (end < 0) return 1;
			}

                        /* New name -> new line of output */
			if (strncasecmp(line,prev,54)) {
				memcpy(prev,line,54);
				printf("R%.53s%10d\n",line + 1,end);
			}

			addr_begin = this_begin; 
			addr_end = this_end;
		}
		else if (this_end > addr_end)
			addr_end = this_end; /* combine overlapping ranges */
	}

	if (addr_begin != addr_end) {
		end = io_out_i4(index,end,addr_begin);
		end = io_out_i4(index,end,addr_end);
		if (end < 0) return 1;
	}

        /* Because the next range's beginning is the previous range's end,
         * we need to write a dummy range entry at the end to terminate the
         * last "real" range. */
	printf("R~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~%10d\n",end);

	if (io_out_i4(index,name_pointer,end) < 0
	||  io_out_i4(index,end_pointer,end) < 0)
		return 1;

	io_close(index);
	return 0;
}

