#include "parser.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>

/* Number of entries in the web_domains table. */
#define MAX_DOMAINS 19
/* Number of entries in the file_extensions table. */
#define MAX_FILE_EXTENSIONS 88
/* Number of entries in the special_words table. */
#define MAX_SPECIAL_WORDS 3
/* Number of entries in the web_extensions table. */
#define MAX_WEB_EXTENSIONS 12
/* Size, in bytes, of the URL and title buffers used throughout this file. */
#define URL_SIZE 1024

/* Module state: the four pointers below are allocated by parser_init() and
   released by parser_cleanup(). */

/* Site string given to parser_init(), stored without a leading "http://" and
   without a trailing '/'. A link is accepted only if it contains this text. */
static char *domain;
/* Same site string, truncated at the index of the first '/' or '\n' found in
   domain. Used as the prefix for links that start with '/'. */
static char *http_domain;
/* Base location written by adjust_actual_url(). Used as the prefix for the
   other relative links. */
static char *actual_url;
/* Text of the first non-empty <title> element read by parse(). */
static char *title;

/* Scheme prefix looked for in the site string and in candidate links. */
static char http[] = "http://";

/* Top-level domain suffixes. url_analysis() rejects a link that does not
   contain `domain` but does contain one of these suffixes, treating it as a
   link to another site. The number of entries must equal MAX_DOMAINS.
   Source of the list:
   http://www.seobythesea.com/2006/01/googles-most-popular-and-least-popular-top-level-domains/ */
static char web_domains[MAX_DOMAINS][8] =
{ ".com", ".org", ".edu", ".gov", ".net", ".br",
  ".uk", ".ca", ".de", ".jp", ".fr", ".au", ".us",
  ".ru", ".ch", ".nl", ".se", ".no", ".es"};

/* File extensions of resources that are not web pages. url_analysis() rejects
   a link whose text, up to its last alphanumeric character, ends with one of
   these (letters are also matched in upper case).
   NOTE(review): ".dvi" is listed twice. The number of initialisers must stay
   equal to MAX_FILE_EXTENSIONS: an unfilled slot would be an empty string,
   which url_analysis() would treat as matching every link. */
static char file_extensions[MAX_FILE_EXTENSIONS][8] =
{ ".au", ".avi", ".bak", ".bib", ".bin", ".bmp", ".bz2", ".c", ".class", ".cpp", ".dat",
  ".dgz", ".djvu", ".dmg", ".doc", ".docx", ".dvi", ".dvi", ".eps", ".exe",
  ".f", ".f95", ".gb", ".gbk", ".gif", ".gz", ".gz2", ".h", ".hpp", ".ico", ".idx",
  ".in", ".jar", ".java", ".jpeg", ".jpg", ".js", ".list", ".log", ".lp",
  ".ltx", ".lua", ".m", ".mid", ".mod", ".mov", ".mp", ".mp3", ".mp4", ".mpeg", ".mpg",
  ".odp", ".ods", ".orc", ".out", ".pdf", ".pgm", ".png", ".pps", ".ppt", ".pptx",
  ".ps", ".py", ".pyc", ".r", ".rar", ".rtf", ".scn", ".sco", ".sh", ".sol", ".sty", ".svg", ".swf",
  ".sxw", ".tar", ".tex", ".tgz", ".toy", ".txt", ".vmd", ".w", ".wav", ".wma",
  ".wmv", ".xls", ".xlsx", ".zip"};

/* Scheme-like prefixes of links that are not crawlable pages. url_analysis()
   rejects any link that contains one of them anywhere in its text. The number
   of entries must equal MAX_SPECIAL_WORDS. */
static char special_words[MAX_SPECIAL_WORDS][16] =
{ "file:", "mailto:", "javascript:"};

/* Extensions of web page files. adjust_actual_url() looks for them after a
   '/' to decide that the text following that '/' names a page, and then cuts
   the URL right after the '/'. The number of entries must equal
   MAX_WEB_EXTENSIONS.
   Source of the list:
   http://en.wikipedia.org/wiki/List_of_file_formats#Webpage */
static char web_extensions[MAX_WEB_EXTENSIONS][8] =
{ ".html", ".htm", ".xhtml", ".xht", ".xml", ".asp",
  ".aspx", ".jsp", ".php", ".phtml", ".shtml", ".stm"};

/* ---------------------------------- */
/*
 * Decodes percent-encoded bytes ("%XX", where XX are two hexadecimal digits)
 * in url, in place.
 *
 * url: NUL-terminated, writable string. It never grows: every "%XX" triple
 *      is replaced by a single byte.
 *
 * A '%' that is not followed by two hexadecimal digits (including a '%' at
 * the very end of the string) is copied unchanged. "%00" decodes to a NUL
 * byte, which therefore ends the resulting string at that point.
 */
void adjust_hexa_char (char *url) {
  size_t src = 0, dst = 0;
  unsigned int value;
  char hex[3], c;

  /* Nothing to decode. */
  if (!strchr(url, '%')) return;

  hex[2] = '\0';
  while (url[src] != '\0') {
    c = url[src];
    /* The && chain stops at the terminator, so url[src + 2] is only read
       when url[src + 1] is a real character. */
    if (c == '%' && isxdigit((unsigned char) url[src + 1])
                 && isxdigit((unsigned char) url[src + 2])) {
      hex[0] = url[src + 1];
      hex[1] = url[src + 2];
      if (sscanf(hex, "%2x", &value) == 1) {
        c = (char) value;
        src += 2;
      }
    }
    /* dst <= src always holds, so writing here never clobbers unread input. */
    url[dst++] = c;
    src++;
  }
  url[dst] = '\0';
}

/* ---------------------------------- */
/*
 * Cuts url at the first occurrence of each of the characters
 * '#', '?', ';', '&', '=' and ',' (tried in that order), replacing the
 * character with a newline followed by the string terminator.
 *
 * url: NUL-terminated, writable string; modified in place.
 */
void adjust_special_char (char *url) {
  static const char cut_marks[] = "#?;&=,";
  const char *mark;
  char *hit;

  for (mark = cut_marks; *mark != '\0'; mark++) {
    hit = strchr(url, *mark);
    if (hit != NULL) {
      hit[0] = '\n';
      hit[1] = '\0';
    }
  }
}

/* ---------------------------------- */
/*
 * Collapses "./" and "../" sequences in url, in place, and then checks that
 * the result still refers to the configured site.
 *
 * url: NUL-terminated, writable string. It never grows.
 *
 * Returns url itself when the rewritten text contains `domain`, otherwise
 * NULL (url is still rewritten in that case).
 *
 * The rewrite needs no scratch buffer: the write position never gets ahead
 * of the read position, so there is no allocation that can fail and no
 * fixed-size buffer to overflow.
 */
char *adjust_local_url(char* url) {
  size_t in, out, j;

  /* Only rewrite when there is something to collapse. */
  if (strstr(url, "../") || strstr(url, "./")) {
    for (in = 0, out = 0; url[in] != '\0'; in++) {
      /* "./": drop the '.'; the loop increment then skips the '/'. */
      if (url[in] == '.' && url[in+1] == '/') {
        in++;
      }
      /* "../": drop the last directory already written to the output. */
      else if (url[in] == '.' && url[in+1] == '.' && url[in+2] == '/') {
        /* Search backwards for the previous '/', starting two positions
           before the write position so that the '/' closing the current
           directory is skipped. If none is found nothing is removed. */
        for (j = out; j >= 2; j--) {
          if (url[j - 2] == '/') {
            out = j - 1;
            break;
          }
        }
        in += 2;
      }
      else url[out++] = url[in];
    }

    url[out] = '\0';
  }

  if (strstr(url, domain)) return url;
  else return NULL;

}

/* ---------------------------------- */
/*
 * Tells whether c may be part of a URL taken from an href attribute.
 *
 * Returns 0 for '>', '<', '"', ' ' and '=', and 1 for every other character.
 */
int url_valid_char (char c) {
  switch (c) {
    case '>':
    case '<':
    case '\"':
    case ' ':
    case '=':
      return 0;
    default:
      return 1;
  }
}

/* ---------------------------------- */
/*
 * Decides whether url is a page of the configured site and, if so, returns
 * it in normalised, absolute form.
 *
 * url: NUL-terminated, writable string; it is modified in place (percent
 *      decoding, cut at special characters, "index.*" removal).
 *
 * Returns NULL when the link is rejected: it ends with a known file
 * extension, contains a special word, points to another site, the resulting
 * URL would not fit in URL_SIZE bytes, or memory could not be allocated.
 *
 * Otherwise returns either url itself (the link already contained `domain`)
 * or a newly allocated buffer of URL_SIZE bytes holding the base location
 * followed by the link. In the second case, i.e. whenever the result is not
 * url, the caller owns the buffer and must free() it.
 */
char *url_analysis (char *url) {
  int i, j, k, len_ext, last;
  char *updated_url, *ptr, *result;
  const char *base;

  /* Index of the last alphanumeric character (-1 if there is none). */
  last = (int) strlen(url) - 1;
  while (last >= 0 && !isalnum((unsigned char) url[last])) last--;

  /* Reject the link when the text ending at `last` ends with a known file
     extension; letters are compared case-insensitively. */
  for (i = 0; i < MAX_FILE_EXTENSIONS; i++) {
    len_ext = (int) strlen(file_extensions[i]);
    for (j = len_ext - 1; j >= 0; j--) {
      k = last + j - (len_ext - 1);
      /* The link is shorter than the extension: cannot match. */
      if (k < 0) break;
      if (tolower((unsigned char) url[k]) != file_extensions[i][j]) break;
    }
    /* Every character of the extension matched, so the link is a file. */
    if (j < 0) return NULL;
  }

  for (i = 0; i < MAX_SPECIAL_WORDS; i++)
    if (strstr(url, special_words[i]))
      return NULL;

  /* Decode percent-encoded characters. */
  adjust_hexa_char(url);

  /* Cut the link at the first special character. */
  adjust_special_char(url);

  /* Remove a trailing index page name, if any. */
  if ((ptr = strstr(url, "index.html"))
      || (ptr = strstr(url, "index.php"))
      || (ptr = strstr(url, "index.htm"))) {
    ptr[0] = '\n';
    ptr[1] = '\0';
  }

  /* The link already mentions the site: only "./" and "../" need fixing. */
  if (strstr(url, domain)) return adjust_local_url(url);
  /* The site is not in the link, so an "http://" link leads elsewhere. */
  if (strstr(url, http)) return NULL;

  /* A known top-level domain suffix also means another site. Looking for
     the bare suffix covers the suffix followed by '.' or '/' as well. */
  for (i = 0; i < MAX_DOMAINS; i++)
    if (strstr(url, web_domains[i]))
      return NULL;

  /* The link is relative: prefix it with the site (links starting with '/')
     or with the current base location (all other links). */
  base = (url[0] == '/') ? http_domain : actual_url;

  /* Refuse links that would not fit instead of overflowing the buffer. */
  if (strlen(base) + strlen(url) >= URL_SIZE) return NULL;

  /* Allocated only now, so that none of the early returns can leak it. */
  updated_url = malloc(URL_SIZE * sizeof(char));
  if (updated_url == NULL) return NULL;

  strcpy(updated_url, base);
  strcat(updated_url, url);

  /* adjust_local_url() returns its argument or NULL; on NULL the buffer
     would otherwise be lost. */
  result = adjust_local_url(updated_url);
  if (result == NULL) free(updated_url);

  return result;
}

/* ---------------------------------- */
/*
 * Allocates the module state and records the site the parser is restricted
 * to.
 *
 * string: site address, optionally starting with "http://". It is not
 *         modified.
 *
 * After the call:
 *   - domain holds string without a leading "http://" and without a
 *     trailing '/';
 *   - http_domain holds the same text cut at the first '/' or '\n';
 *   - actual_url and title are empty strings of capacity URL_SIZE.
 *
 * The buffers must be released with parser_cleanup(). If memory cannot be
 * allocated a message is written to stderr and the program exits, since the
 * function has no way to report failure to its caller.
 */
void parser_init (char *string)
{
  size_t i, len;
  const char *site = string;

  /* Strip the scheme only when it really is a prefix of the string. */
  if (strstr(string, http) == string)
    site = string + strlen(http);

  len = strlen(site);

  domain = malloc((len + 1) * sizeof(char));
  http_domain = malloc((len + 1) * sizeof(char));
  actual_url = malloc(URL_SIZE * sizeof(char));
  title = malloc(URL_SIZE * sizeof(char));

  if (domain == NULL || http_domain == NULL ||
      actual_url == NULL || title == NULL) {
    fprintf(stderr, "parser_init: out of memory\n");
    exit(EXIT_FAILURE);
  }

  /* Make both buffers valid strings even before the first parse(). */
  actual_url[0] = '\0';
  title[0] = '\0';

  /* Copying from the caller's string avoids overlapping copies. */
  strcpy(domain, site);
  strcpy(http_domain, site);

  /* Remove a possible '/' at the end. */
  if (len > 0 && domain[len - 1] == '/')
    domain[len - 1] = '\0';

  /* Keep in http_domain only what precedes the first '/' or newline. */
  for (i = 0; domain[i] != '\0'; i++)
    if (domain[i] == '/' || domain[i] == '\n') break;
  http_domain[i] = '\0';

}

/* ---------------------------------- */
/*
 * Reduces string to the location relative links are resolved against, and
 * copies the result into actual_url.
 *
 * string: NUL-terminated, writable URL of the page being parsed; modified in
 *         place. Trailing whitespace is ignored when looking at its end.
 *
 * Rules, applied to the last non-whitespace character:
 *   - it is '/': the string is cut right after it;
 *   - otherwise, if the text following some '/' contains one of the
 *     web_extensions, the string is cut right after that '/' (slashes are
 *     tried from right to left);
 *   - otherwise a '/' is appended after that character.
 * A string that is empty or all whitespace is copied unchanged.
 *
 * NOTE(review): appending the '/' may write one byte past the original
 * terminator, so the caller's buffer needs one spare byte; the final copy
 * assumes the result fits in actual_url. Neither size is visible from here.
 */
void adjust_actual_url (char *string)
{
  int last, slash, k, is_page = 0;

  /* Index of the last non-whitespace character (-1 if there is none). */
  last = (int) strlen(string) - 1;
  while (last >= 0 && isspace((unsigned char) string[last])) last--;

  if (last >= 0) {
    if (string[last] == '/') {
      /* Already a directory: just drop what follows the '/'. */
      string[last + 1] = '\0';
    }
    else {
      for (slash = last; slash >= 0 && !is_page; slash--) {
        if (string[slash] != '/') continue;
        for (k = 0; k < MAX_WEB_EXTENSIONS; k++) {
          if (strstr(string + slash + 1, web_extensions[k])) {
            /* What follows this '/' names a page: keep its directory. */
            string[slash + 1] = '\0';
            is_page = 1;
            break;
          }
        }
      }
      if (!is_page) {
        /* No page name found: treat the whole string as a directory. */
        string[last + 1] = '/';
        string[last + 2] = '\0';
      }
    }
  }

  strcpy(actual_url, string);
}

/* ---------------------------------- */
/*
 * Scans an HTML file for links and writes the accepted ones to another file.
 *
 * file_in:  path of the HTML file to read.
 * file_out: path of the file that receives the accepted URLs; it is created
 *           or truncated. Each URL is written exactly as returned by
 *           url_analysis().
 * string:   URL of the page stored in file_in; it is passed to
 *           adjust_actual_url() and therefore modified in place.
 *
 * While scanning, the text of the first non-empty <title> element is stored
 * in `title` (newlines become spaces, trailing whitespace is removed, and
 * text beyond URL_SIZE - 1 characters is dropped). For every "href" found
 * inside a tag that starts with "<a", the following run of URL characters
 * is handed to url_analysis(); candidates too long for the URL_SIZE buffer
 * are skipped.
 *
 * Returns the number of URLs written, or 0 if either file cannot be opened
 * (the reason is reported on stderr).
 */
int parse (char *file_in, char *file_out, char *string) {
  FILE *input, *output;
  char href[] = "href";
  char a_tag[] = "<a";
  char title_tag[] = "<title>";
  char title_tag_upper[] = "<TITLE>";
  const int href_len = (int) (sizeof href - 1);
  const int a_tag_len = (int) (sizeof a_tag - 1);
  const int title_len = (int) (sizeof title_tag - 1);
  char buffer[URL_SIZE];
  int aux;   /* int, not char, so that EOF is distinguishable from data */
  char *checked_url;
  int i, j, k, truncated, url_found = 0, title_flag = 1;

  input = fopen(file_in, "r");
  if (input == NULL) {
    perror(file_in);
    return 0;
  }
  output = fopen(file_out, "w");
  if (output == NULL) {
    perror(file_out);
    fclose(input);
    return 0;
  }
  adjust_actual_url(string);

  i = 0;
  k = 0;
  while ((aux = fgetc(input)) != EOF) {

    /* Advance the <title> matcher; whitespace between the characters of
       the tag does not reset it. */
    if (k < title_len && (aux == title_tag[k] || aux == title_tag_upper[k]))
      k++;
    else if (!isspace(aux))
      k = 0;

    /* k == title_len means the whole <title> tag has just been read. */
    if (k == title_len && title_flag) {
      /* Skip whitespace before the title text. */
      do {
        aux = fgetc(input);
      } while (aux != EOF && isspace(aux));

      k = 0;
      if (aux != EOF && aux != '<') {
        title[k++] = (char) aux;
        while ((aux = fgetc(input)) != EOF && aux != '<') {
          /* Keep one byte for the terminator; extra text is dropped. */
          if (k < URL_SIZE - 1)
            title[k++] = (aux == '\n') ? ' ' : (char) aux;
        }

        /* Remove trailing whitespace; title[0] is never whitespace. */
        while (k > 0 && isspace((unsigned char) title[k - 1])) k--;
        title[k] = '\0';
        title_flag = 0;
      }
      /* k was used as the title length: restart the tag matcher. */
      k = 0;
    }

    if (aux == a_tag[i])
      i++;
    else
      i = 0;

    /* i == a_tag_len means we are inside a tag that starts with "<a". */
    if (i == a_tag_len) {
      /* Until the tag is closed (or the file ends). */
      i = 0;
      while (aux != '>' && aux != EOF) {

        aux = fgetc(input);
        if (aux == EOF) break;

        if (i < href_len && aux == href[i])
          i++;
        else
          i = 0;

        /* i == href_len means "href" has just been read. */
        if (i == href_len) {
          /* Skip '=', quotes, spaces and angle brackets before the URL. */
          do {
            aux = fgetc(input);
          } while (aux != EOF && !url_valid_char((char) aux));
          if (aux == EOF) break;

          j = 0;
          truncated = 0;
          buffer[j++] = (char) aux;
          /* '=' is accepted inside the URL although it cannot start it. */
          while ((aux = fgetc(input)) != EOF &&
                 (aux == '=' || url_valid_char((char) aux))) {
            /* Keep two bytes for the final "\n" and terminator. */
            if (j < URL_SIZE - 2) buffer[j++] = (char) aux;
            else truncated = 1;
          }
          buffer[j++] = '\n';
          buffer[j] = '\0';

          if (!truncated && (checked_url = url_analysis(buffer))) {
            fputs(checked_url, output);
            url_found++;
            /* url_analysis() returns either buffer or a heap block. */
            if (checked_url != buffer) free(checked_url);
          }
        }
      }
      /* i was used as the "href" matcher: restart the "<a" matcher. */
      i = 0;
    }
  }

  fclose(input);
  if (fclose(output) != 0)
    perror(file_out);

  return url_found;

}

/*
 * Releases the buffers allocated by parser_init().
 *
 * The pointers are reset to NULL afterwards, so calling this function twice
 * is harmless and get_title() returns NULL instead of a dangling pointer.
 */
void parser_cleanup ()
{
  free(domain);
  domain = NULL;
  free(http_domain);
  http_domain = NULL;
  free(actual_url);
  actual_url = NULL;
  free(title);
  title = NULL;
}

/*
 * Gives access to the title buffer filled by parse().
 *
 * Returns the module's `title` pointer itself (not a copy); the buffer is
 * released by parser_cleanup().
 */
char *get_title() {
  char *current_title = title;

  return current_title;
}

