/* Copyright 2002 Daniel Egnor.  See LICENSE file.
 *
 * This file reads the output of goo-parsehandler-geocode.cc
 * (the "--geocode" option to goo-ripper) and builds an index file.
 * It needs to be run twice; the second time ("stage 2"), it will
 * be able to construct a "heap" which must lie after the stuff it
 * built on the first time.  The heap contains various variable-length
 * strings and whatnot.  It's not a real heap, just a sequence of stuff. */

extern "C" {
#include "io.h"
}

#include <iostream>
#include <vector>
#include <string.h>

using namespace std;

/* Build (stage 1) or complete (stage 2) the index file named by argv[1]
 * from the records read on standard input.
 *
 * Index header: three i4 slots at the start of the file, referred to below
 * as region_pointer, term_pointer and end_pointer.  Each is filled in with
 * the offset at which the corresponding section starts (or, for
 * end_pointer, where the fixed-size part of the index ends).  On startup
 * the value stored in the end_pointer slot is read back into `heap`: if it
 * is non-zero we are in stage 2 and variable-length data is appended
 * starting at that offset; otherwise (stage 1) the header is zeroed and
 * only the fixed-size entries are written.
 *
 * Input records, one per line, as parsed below (column = 0-based offset):
 *   XD  batch id [2,11)  document id [11,20)  heap text [20,...)
 *   XL  batch id [2,11)  document id [11,20)  number [20,30)  number [30,39)
 *       heap text [39,...)
 *   XR  region key [2,16) as fields of 2+6+6 chars
 *       batch id [16,25)  document id [25,34)
 *   XT  term [2,11)  batch id [11,20)  document id [20,29)
 * Records must arrive in non-decreasing (case-insensitive) order; records
 * that are out of order, malformed or too long are skipped with a warning.
 *
 * NOTE(review): the io_* helpers come from io.h, which is not visible
 * here.  As used in this function they return the offset just past the
 * data read/written, and a negative return is treated as failure --
 * confirm against io.h.  The index handle is never explicitly closed
 * here; verify whether io.h needs a close/flush call.
 *
 * Returns 0 on success, 1 on an I/O failure, 2 on a usage error. */
int main(int argc, char **argv) {
  if (argc != 2) {
    std::cerr << "usage: " << argv[0] << " index-file < index-data" << std::endl;
    return 2;
  }

  io_file * const index = io_open(argv[1]);
  if (NULL == index) return 1;

  int end, heap, region_pointer, term_pointer, end_pointer;

  /* If the index hasn't been built yet, this will emit an I/O EOF warning.
   * Oh well, the user can just ignore it.  Eventually we should fix this. */
  end = heap = 0;
  end = io_in_i4(index, region_pointer = end, NULL);
  end = io_in_i4(index, term_pointer = end, NULL);
  end = io_in_i4(index, end_pointer = end, &heap);

  /* No usable header (read failed, or stage 1 never finished): start over
   * with a zeroed header, which also selects stage 1 (heap == 0). */
  if (end < 0 || 0 == heap) {
    end = heap = 0;
    end = io_out_i4(index, region_pointer = end, 0);
    end = io_out_i4(index, term_pointer = end, 0);
    end = io_out_i4(index, end_pointer = end, 0);
  }

  if (end < 0) return 1;

  std::cerr << "notice: indexing stage " << (heap ? 2 : 1) << std::endl;

  /* batch[i] is the index offset at which batch i's document entries
   * begin; 0 is used below as "no such batch". */
  std::vector<int> batch;
  int document = 0, document_end = 0;

  /* Each document entry is written as two i4 values (see the 'D' branch);
   * this constant must match that -- presumably 2 * 4 bytes. */
  const int document_size = 8; // WARNING MAGIC

  char line[8192]; // limits maximum URL+title size
  char prev[sizeof line] = "";
  for (;;) {
    if (!std::cin.getline(line, sizeof line)) {
      if (std::cin.eof() || std::cin.bad()) break;

      /* failbit alone means the record did not fit in `line`.  Previously
       * this ended the loop and silently produced a truncated index; now
       * the oversized record is dropped like any other bad record. */
      std::cerr << "warning: input line too long, skipped" << std::endl;
      std::cin.clear();
      char ch;
      while (std::cin.get(ch) && ch != '\n') { /* discard rest of line */ }
      continue;
    }

    if (line[0] != 'X') {
      std::cerr << "warning: invalid input record: \"" << line << "\""
                << std::endl;
      continue;
    }

    if (strcasecmp(line, prev) < 0) {
      std::cerr << "warning: \"" << prev << "\" > \"" << line << "\""
                << std::endl;
      continue;
    }

    if (line[1] != prev[1]) {
      /* Record type changed: terminate the section we were writing... */
      if ('D' == prev[1]) {
        end = io_out_i4(index, io_out_i4(index, end, 0), heap);
        if (end < 0) return 1;
      }
      else if ('L' == prev[1]) {
        /* Point every remaining document entry at the end of the 'L'
         * section before writing the terminating entry. */
        while (document < document_end) {
          if (io_out_i4(index, document, end) < 0) return 1;
          document += document_size;
        }
        end = io_out_i4(index,
              io_out_i4(index,
              io_out_i4(index, end, 0), 0), heap);
        if (end < 0) return 1;
      }
      else if ('R' == prev[1]) {
        end = io_out_i4(index,
              io_out_i4(index,
              io_out_i4(index,
              io_out_i1(index, end, 127), 0), 0), heap);
        if (end < 0) return 1;
      }

      /* ...and note where the new section starts. */
      if ('L' == line[1]) {
        document_end = end;
        document = batch.empty() ? 0 : batch[0];
      }
      else if ('R' == line[1]) {
        if (io_out_i4(index, region_pointer, end) < 0) return 1;
      }
      else if ('T' == line[1]) {
        if (io_out_i4(index, term_pointer, end) < 0) return 1;
      }
    }

    const int len = static_cast<int>(strlen(line));
    if ('D' == line[1]) {
      if (len < 20) {
        std::cerr << "warning: input line truncated" << std::endl;
        continue;
      }

      const int batch_id = io_strntoi(line + 2, 9);
      const int document_id = io_strntoi(line + 11, 9);
      if (batch_id < 0 || document_id < 0) {
        std::cerr << "warning: invalid document id" << std::endl;
        continue;
      }

      /* First record of a new batch: remember where its entries start
       * and restart the per-batch document counter. */
      if (batch_id >= static_cast<int>(batch.size())) {
        batch.resize(1 + batch_id, end);
        document = 0;
      }

      /* Emit one (0, heap) entry per document id up to and including
       * this one, so skipped ids still get a slot. */
      while (document <= document_id) {
        end = io_out_i4(index, io_out_i4(index, end, 0), heap);
        if (end < 0) return 1;
        ++document;
      }

      if (heap) {
        heap = io_out(index, heap, line + 20, len - 20);
        if (heap < 0) return 1;
      }
    }
    else if ('L' == line[1]) {
      if (len < 39) {
        std::cerr << "warning: input line truncated" << std::endl;
        continue;
      }

      const int batch_id = io_strntoi(line + 2, 9);
      const int document_id = io_strntoi(line + 11, 9);
      if (batch_id < 0 || document_id < 0) {
        std::cerr << "warning: invalid document id" << std::endl;
        continue;
      }

      if (batch_id >= static_cast<int>(batch.size()) || 0 == batch[batch_id]) {
        std::cerr << "warning: missing document id" << std::endl;
        continue;
      }

      /* Here `document` is an index offset (not an id): store the current
       * 'L' position into every document entry up to this record's. */
      const int doc = batch[batch_id] + document_id * document_size;
      while (document <= doc) {
        if (io_out_i4(index, document, end) < 0) return 1;
        document += document_size;
      }

      end = io_out_i4(index, io_out_i4(index, io_out_i4(index, end,
            io_strntoi(line + 20, 10)),
            io_strntoi(line + 30, 9)), heap);
      if (end < 0) return 1;

      if (heap) {
        heap = io_out(index, heap, line + 39, len - 39);
        if (heap < 0) return 1;
      }
    }
    else if ('R' == line[1]) {
      if (len < 34) {
        std::cerr << "warning: input line truncated" << std::endl;
        continue;
      }

      if (0 != strncasecmp(line, prev, 16)) {
        /* New region key: start a new region entry. */
        /* TODO: wait until we've collected at least a few? */
        end = io_out_i4(index, io_out_i4(index, io_out_i4(index,
              io_out_i1(index, end,
              io_strntoi(line + 2, 2)),
              io_strntoi(line + 4, 6)),
              io_strntoi(line + 10, 6)), heap);
        if (end < 0) return 1;
      }
      else if (0 == strncasecmp(line + 16, prev + 16, 18))
        continue; /* same region and same document as before: duplicate */

      if (heap) {
        const int batch_id = io_strntoi(line + 16, 9);
        const int document_id = io_strntoi(line + 25, 9);
        if (batch_id < 0 || document_id < 0)
          std::cerr << "warning: invalid document id" << std::endl;
        else if (batch_id >= static_cast<int>(batch.size()) ||
                 0 == batch[batch_id])
          std::cerr << "warning: missing document id" << std::endl;
        else {
          heap = io_out_i4(index, heap,
                           batch[batch_id] + document_id * document_size);
          /* A failed heap write must stop the run, as in 'D' and 'L'. */
          if (heap < 0) return 1;
        }
      }
    }
    else if ('T' == line[1]) {
      if (len < 29) {
        std::cerr << "warning: input line truncated" << std::endl;
        continue;
      }

      if (0 != strncasecmp(line, prev, 11)) {
        /* New term: write its 9 characters and where its postings start. */
        end = io_out_i4(index, io_out(index, end, line + 2, 9), heap);
        if (end < 0) return 1;
      }

      if (heap) {
        const int batch_id = io_strntoi(line + 11, 9);
        const int document_id = io_strntoi(line + 20, 9);
        if (batch_id < 0 || document_id < 0)
          std::cerr << "warning: invalid document id" << std::endl;
        else if (batch_id >= static_cast<int>(batch.size()) ||
                 0 == batch[batch_id])
          std::cerr << "warning: missing document id" << std::endl;
        else {
          heap = io_out_i4(index, heap,
                           batch[batch_id] + document_id * document_size);
          /* A failed heap write must stop the run, as in 'D' and 'L'. */
          if (heap < 0) return 1;
        }
      }
    }
    else {
      std::cerr << "warning: invalid input record type" << std::endl;
      continue;
    }

    /* Only accepted records become the reference for the ordering and
     * duplicate checks above. */
    strcpy(prev, line);
  }

  /* Close the term section with a "~~~~~~~~~" entry, then record where
   * the fixed-size part ends; stage 2 reads this value back as the
   * starting heap offset. */
  if ('T' == prev[1])
    end = io_out_i4(index, io_out(index, end, "~~~~~~~~~", 9), heap);
  if (end < 0 || io_out_i4(index, end_pointer, end) < 0)
    return 1;

  return 0;
}

