/* Copyright 2002 Daniel Egnor.  See LICENSE file.
 *
 * This file, which hooks into the modified Google "ripper", handles
 * preparsed input, scans for addresses, collects keywords, and outputs
 * a simple text file which can be collected into a keyword and geographical
 * index by "geo-index". */

#include "goo-handler-parser.h"

extern "C" {
#include "geo.h"
#include "geodesy.h"
}

#include <iostream>
#include <iomanip>
#include <string>
#include <set>
#include <map>

#include <string.h>
#include <ctype.h>
#include <math.h>

using namespace std;

class GeoCodeParseHandler : public ParseHandler {
 public:
  GeoCodeParseHandler(const char* index,int batch) 
    : index_(io_open(index)), 
      batch_(batch), document_(0),
      buffer_size_(0), 
      in_head_(false) {
  }

  ~GeoCodeParseHandler() {
    io_close(index_);
  }

  void AddTerm(const char* text, int termlen, int face, int size) {
    if (0 == termlen) return; // Just in case.

    if (in_head_) title_.append(text, termlen);

    if (buffer_size_ > 0) {
      while (buffer_size_ + termlen + 1 > static_cast<int>(sizeof buffer_)) {
        if (0 == buffer_size_) return; // BUGBUG?
        EatBuffer();
      }
    }

    if (isdigit(text[0]) || buffer_size_ > 0) {
      if (buffer_size_ > 0)
        buffer_[buffer_size_++] = ' ';
      memcpy(buffer_ + buffer_size_, text, termlen);
      buffer_size_ += termlen;
    }

    term w;
    int i = 0;
    while (i < (int) sizeof(w.t) && termlen > 0) {
      if (isalnum(*text))
        w.t[i++] = *text;
      ++text;
      --termlen;
    }
    if (i > 0) {
      while (i < (int) sizeof(w.t))
        w.t[i++] = ' ';
      terms_.insert(w);
    }
  }

  void AddPunctuation(const char* text, int termlen, int face, int size) {
    if (in_head_) 
      for (int i = 0; i < termlen; ++i)
        if (text[i] != '\n')
          title_.append(1, text[i]);
  }

  void WhitespaceEndedTerm() {
    if (in_head_) title_.append(1, ' ');
  }

  void AddHead(const char* fields, int fieldlen) {
    in_head_ = true;
    title_.erase();
  }

  void AddHeadDone() { in_head_ = false; }
  void AddBody(const char*, int) { in_head_ = false; }
  void AddAnchor(const char*, int) { in_head_ = false; }
  void ChangeFontColor(const char*, int) { in_head_ = false; }
  void ChangeBGColor(const char*, int) { in_head_ = false; }
  void AddImage(const char*, int) { in_head_ = false; }
  void AddForm(const char*, int) { in_head_ = false; }
  void AddApplet(const char*, int) { in_head_ = false; }
  void AddArea(const char*, int) { in_head_ = false; }
  void AddFrame(const char*, int) { in_head_ = false; }

  void EndDocument(const Document* doc) {
    if (title_.size() > 120) {
      title_.erase(120);
      title_.append("...");
    }

    const string::size_type title_size = title_.size();
    if (title_size > 1 && ' ' == title_[title_size - 1])
      title_.erase(title_size - 1);
    cout << "XD" << setw(9) << batch_ << setw(9) << document_
         << doc->url() << "|" << title_ << "\n";
    title_.erase();

    for (set<term>::const_iterator w = terms_.begin(); w != terms_.end(); ++w) {
      cout << "XT";
      cout.write(w->t, sizeof(w->t));
      cout << setw(9) << batch_ << setw(9) << document_ << "\n";
    }
    terms_.clear();

    EatBuffer();
    map<pair<double,double>,geo_location>::const_iterator l;
    for (l = locations_.begin(); l != locations_.end(); ++l) {
      cout << "XL" << setw(9) << batch_ << setw(9) << document_
           << setw(10) << (int) (1000000 * l->second.at.longitude)
           << setw(9) << (int) (1000000 * l->second.at.latitude)
           << l->second.at.address << " " << l->second.street_name;
      if (l->second.city_name[0])
        cout << ", " << l->second.city_name;
      if (l->second.state_name[0])
        cout << ", " << l->second.state_name;
      if (l->second.zip_code > 0)
        cout << " " << setfill('0') << setw(5) 
             << l->second.zip_code << setfill(' ');
      cout << "\n";

      cout << "XR 0     0     0"
           << setw(9) << batch_ << setw(9) << document_ << "\n";
      const double east = geo_mercator_easting(l->second.at.longitude);
      const double north = geo_mercator_northing(l->second.at.latitude);
      for (int exponent = 1; exponent <= 16; ++exponent) {
        const int divisor = 1 << exponent;
        const int east_code = abs((int) floor(east * divisor));
        const int north_code = abs((int) floor(north * divisor));
        if (north_code > 65536) break;
        cout << "XR" << setw(2) << exponent
             << setw(6) << east_code
             << setw(6) << north_code
             << setw(9) << batch_ << setw(9) << document_ << "\n";
      }
    }
    locations_.clear();

    ++document_;
  }

 private:
  void EatBuffer() {
    if (0 == buffer_size_) return;

    struct geo_location where;
    if (geo_find(index_, buffer_, buffer_size_, &where))
      locations_[make_pair(where.at.longitude,where.at.latitude)] = where;

    int ptr = 0;
    do {
      while (ptr != buffer_size_ && ' ' != buffer_[ptr]) ++ptr;
      while (ptr != buffer_size_ && ' ' == buffer_[ptr]) ++ptr;
    } while (ptr != buffer_size_ && !isdigit(buffer_[ptr]));

    memmove(buffer_, buffer_ + ptr, buffer_size_ - ptr);
    buffer_size_ -= ptr;
  }

  struct term { 
    char t[9]; 
    bool operator<(const term &w) const { return strncasecmp(t, w.t, 9) < 0; }
  };

  io_file* const index_;
  const int batch_;
  int document_;
  set<term> terms_;
  map<pair<double,double>,geo_location> locations_;

  char buffer_[120]; // REVIEW: should be string or vector or something.
  int buffer_size_;

  string title_;
  bool in_head_;
};

// Factory: creates a heap-allocated GeoCodeParseHandler for the given index
// name and batch number and returns it through the ParseHandler interface.
// The handler is allocated with new; nothing here releases it.
ParseHandler* MakeGeoCodeHandler(const char* index, int batch) {
  ParseHandler* const handler = new GeoCodeParseHandler(index, batch);
  return handler;
}

