/* File : urlparse.c - Dillo
 *
 * Copyleft (C) 2001 Livio Baldini Soares <livio@linux.ime.usp.br>
 *
 * Parse and normalize all URLs inside Dillo.
 * 
 */


#include <glib.h>
#include <string.h>  
#include <stdio.h>  
#include <stdlib.h>  /* for strtol()          */
#include <math.h>    /* for log10() and pow() */
#include <ctype.h>   /* for isspace()         */

#include "urlparse.h"
#include "misc.h"

/*
 *  Put a DilloURL into its "empty" state: every string member is set
 *  to NULL and the port to 0.
 *  Nothing is freed here; whatever the members pointed to before the
 *  call is simply overwritten.
 */
void a_DilloUrl_init(DilloURL *url)
{
   /* 0 means "no port given" (the HTTP default, 80, is not filled in) */
   url->port          = 0;

   url->path          = NULL;
   url->hostname      = NULL;
   url->protocol      = NULL;       /* no scheme yet (HTTP is not assumed) */
   url->original_url  = NULL;
}

/*
 *  Accessor for the protocol (scheme) part of the URL,
 *  e.g. `http', `ftp', `file', ...
 *  The pointer refers to the string stored inside `url' (it is not a
 *  copy) and is NULL when no protocol has been set.
 */
gchar *a_Url_get_protocol(const DilloURL *url)
{
   gchar *protocol = url->protocol;

   return protocol;
}

/*
 *  Accessor for the hostname part of the URL,
 *  e.g. `dillo.sourceforge.net', `www.google.com', ...
 *  The pointer refers to the string stored inside `url' (it is not a
 *  copy) and is NULL when no hostname has been set.
 */
gchar *a_Url_get_hostname(const DilloURL *url) 
{
   gchar *hostname = url->hostname;

   return hostname;
}

/*
 *  Accessor for the file/directory path of the URL (the `path' member),
 *  e.g. `cgi-bin/Dillo_query.cgi', `index.html', '/', ...
 *  The pointer refers to the string stored inside `url' (it is not a
 *  copy) and is NULL when no path has been set.
 */
gchar *a_Url_get_directory(const DilloURL *url) 
{
   gchar *path = url->path;

   return path;
}

/*
 *  Accessor for the port number of the URL,
 *  e.g. 80 (HTTP), 21 (FTP), 443 (HTTPS), ...
 *  A freshly initialized URL reports 0.
 */
gint a_Url_get_port(const DilloURL *url)
{
   gint port = url->port;

   return port;
}

/*
 *  Store `port' as the port number of this URL.
 *  No range checking is done on the value.
 */
void a_Url_set_port(DilloURL *url, const gint port)
{
   (*url).port = port;
}


/*
 *  Make a deep copy of a DilloURL.
 *  A new structure is allocated and every string member of `ori' is
 *  duplicated, so the copy shares no storage with the original.
 *  Members that are NULL in `ori' are NULL in the copy as well.
 */
DilloURL* a_Url_dup(const DilloURL *ori)
{
   DilloURL *copy = g_new(DilloURL, 1);

   a_DilloUrl_init(copy);

   /* g_strdup() maps NULL to NULL, so unset members stay unset */
   copy->port         = ori->port;
   copy->path         = g_strdup(ori->path);
   copy->hostname     = g_strdup(ori->hostname);
   copy->protocol     = g_strdup(ori->protocol);
   copy->original_url = g_strdup(ori->original_url);

   return copy;
}

/*
 *  Transform an URL string into the respective DilloURL.
 *  If we have an URL string =  "http://dillo.sourceforge.net:8080/index.html"
 *  Then the resulting DilloURL should be:
 *  DilloURL = { 
 *               protocol = "http",
 *               hostname = "dillo.sourceforge.net",
 *               path     = "index.html",
 *               port     =  8080,
 *  }
 *
 *  Return NULL if URL is badly formed, i.e. doesn't have a `hostname'.
 */   
DilloURL *a_string_to_Url(const gchar *urlstring)
{
   DilloURL *url = g_new(DilloURL, 1);
   char *ptr, *ptr2;
   char *endptr = (char *)urlstring + strlen(urlstring);

   a_DilloUrl_init(url);

   /* Initialize original URL */
   url->original_url = strdup(urlstring);

   while (urlstring && isspace(*urlstring)) urlstring++;

   /* Try to find protocol, must me something like,
      prot:/ or prot://
   */
   ptr = strstr(urlstring, ":/");
   if (ptr) {
      url->protocol = g_malloc(ptr - urlstring + 1);
      strncpy(url->protocol, urlstring, ptr - urlstring);
      url->protocol[ptr-urlstring] = 0;
      urlstring = ++ptr;

      if (strcmp(url->protocol, "file"))
	 while (*urlstring == '/')
	    urlstring++;
   } else if ( (ptr = strchr(urlstring,  ':')) ) { /* about method has no '/' */
      url->protocol = g_malloc(ptr - urlstring + 1);
      strncpy(url->protocol, urlstring, ptr - urlstring);
      url->protocol[ptr-urlstring] = 0;
      urlstring = ++ptr;

      if (strcmp(url->protocol, "about"))
	 return NULL;
   }

   /* Now try to find hostname, must be before the first '/' */
   if (url->protocol && strcmp(url->protocol, "file") && strcmp(url->protocol, "about")) {
      ptr = strchr(urlstring, (int)'/');
      ptr2 = strchr(urlstring, (int)':');

      if (ptr2 && ( (!ptr) || (ptr && (ptr2 < ptr)) ))
	 ptr = ptr2;      
     
      if (! ptr)
	 ptr = endptr;
     
      url->hostname = g_malloc(ptr - urlstring+1);
      strncpy(url->hostname, urlstring, ptr - urlstring);
      url->hostname[ptr-urlstring] = 0;
      urlstring = ptr;
   }

   /* Now check for port */
   if (*urlstring == ':') {
      ptr = strchr(urlstring, (int)'/');

      if (!ptr)
	 ptr = endptr;

      urlstring++;
      url->port = strtol(urlstring, NULL, 10);
      urlstring = ptr;
   }
   
   if (*urlstring)
      urlstring++;

   /* Now register PATH */
   ptr = endptr;
   if (urlstring < ptr) {
      url->path = malloc(ptr - urlstring + 2);
      url->path[0] = '/'; url->path[1] = 0;   /* path always start with '/' */
      strcat(url->path, urlstring);
   }

   /*   g_print("(a_string_to_Url):\n");
   g_print("\tprotocol:%s\n", url->protocol);
   g_print("\thostname:%s\n", url->hostname);
   g_print("\tpath:%s\n", url->path);
   g_print("\tport:%d\n", url->port); */

   return url;
}

gchar *a_Url_to_string(const DilloURL *url)
{
   /*   g_print("(a_Url_to_string2): final string: %s\n", url->original_url);  */
   return g_strdup(url->original_url);
   /* return url->original_url; */
}

/*
 *  Transform the DilloURL struct into a single URL string.
 *  If: DilloURL = { 
 *                   protocol = "http",
 *                   hostname = "dillo.sourceforge.net",
 *                   path     = "index.html",
 *                   port     =  0,
 *  }
 *  The final string should be something like:
 *  "http://dillo.sourceforge.net/index.html"
 *
 *  Return NULL if URL is badly formed, i.e. doesn't have a `hostname'.
 */
gchar *a_Url_to_string2(const DilloURL *url)
{
   /*   return url->original_url; */

   gchar *urlstring = NULL;
   gint urlstring_size = 0;
   /*     LBS: I'm realloc'ing urlstring and raising urlstring_size accordingly, */
   /*        as URL's parts are being read. Is this too bad? Is it better to */
   /*        count it all before starting and do just one alloc? */

   g_print("(a_Url_to_string):Top:current URL\n");
   g_print("\tprotocol:%s\n", url->protocol);
   g_print("\thostname:%s\n", url->hostname);
   g_print("\tpath:%s\n", url->path);
   g_print("\tport:%d\n", url->port);
   
   if (!url->hostname && strcmp(url->protocol, "file")) 
      return NULL;

   /*     Set up PROTOCOL */
   if (url->protocol) {
      urlstring_size = strlen(url->protocol)+1;
      urlstring = g_malloc(urlstring_size); 
      strcpy(urlstring, url->protocol); 
   } 
   else { 
      urlstring_size = strlen(DILLO_URL_HTTP_PROTOCOL)+1; 
      urlstring = g_malloc(urlstring_size); 
      strcpy(urlstring, DILLO_URL_HTTP_PROTOCOL); 
      /*        LBS: Always use HTTP as default method ?? */
   }

   /*  LBS: Should we check for FILE protocol, having only one / instead of */
   /*  two (/ /), ie, file:/ against file:/ /  */
   
   if (!strcmp(url->protocol, "file")) { 
      urlstring_size += strlen(":/"); 
      urlstring = g_realloc(urlstring, urlstring_size);  
      strcat(urlstring, ":/"); 
   } 
   else { 
      urlstring_size += strlen("://"); 
      urlstring = g_realloc(urlstring, urlstring_size); 
      strcat(urlstring, "://"); 
   } 
   
   /*  Set up HOSTNAME */
   if (strcmp(url->protocol, "file")){ 
      urlstring_size += strlen(url->hostname); 
      urlstring = g_realloc(urlstring, urlstring_size); 
      strcat(urlstring, url->hostname); 
   } 
   
   /*  Set up PORT */
   if (url->port) { 
      gint portstring_size; 

      portstring_size = log10(url->port); 
      if (pow(10, portstring_size) != url->port) 
  	 portstring_size++; 
      if (!portstring_size) 
  	 portstring_size++; 

      portstring_size++;  /* for the ':' before port number */

      g_realloc(urlstring, urlstring_size + portstring_size); 
      sprintf(urlstring+urlstring_size, ":%d", url->port); 
      urlstring_size += portstring_size; 
   } 

   if (url->hostname) { 
      urlstring_size++; 
      g_realloc(urlstring, urlstring_size); 
      strcat(urlstring, "/"); 
   } 

   if (url->path) { 
      urlstring_size += strlen(url->path); 
      g_realloc(urlstring, urlstring_size); 
      strcat(urlstring, url->path); 
   } 
   
   g_print("(a_Url_to_string): final string: %s:%d:%d\n", urlstring, urlstring_size, strlen(urlstring)); 

   return urlstring; 
}


/*
 * URL parsing routines ====================================================
 */

/*
 *  This routine checks if two DilloURL's are "the same", i.e.
 *  they point to the same place. 
 *  The comparison is an exact, case-sensitive match of the strings
 *  returned by a_Url_to_string(); no normalization is attempted.
 *
 *  Returns 0 if they are the same, 1 otherwise.
 *  Two URLs whose strings are both unset (NULL) count as the same;
 *  a NULL URL pointer only equals another NULL pointer.
 */
gint a_Url_cmp(const DilloURL* A, const DilloURL *B)
{
   gchar *a, *b;
   gint differ;

   if (A == B)
      return 0;
   if (!A || !B)
      return 1;

   a = a_Url_to_string(A);
   b = a_Url_to_string(B);

   if (a && b)
      differ = (strcmp(a, b) != 0);
   else
      differ = (a != b);   /* same only when both strings are unset */

   g_free(a);              /* g_free() accepts NULL */
   g_free(b);

   return differ;
}


/*
 * This routine checks to see if the URL passed is of the absolute form, or
 * of the relative form.
 *
 * The string is scanned for the first character that belongs to the
 * URN_OTHER set (defined outside this file); the URL is taken as
 * absolute when that first hit is a ':'.
 *
 * Return Value:
 *   0 is not absolute, otherwise is absolute
 */
gint a_Url_is_absolute(const gchar *url)
{
   const char *hit = strpbrk(url, URN_OTHER);

   if (hit == NULL)
      return 0;

   return *hit == ':';
}

/*
 * Locate the fragment ("#c" in "http://a/b#c") of an URL.
 * Only the `path' member is searched.
 *
 * Return Value:
 *   a pointer (into Url->path) to the last hash, or NULL when the URL
 *   has no path or the path has no hash.
 *
 * todo: not checked for standards compliance. When there is more than
 * one hash, the last one is used --MR--
 */
char* a_Url_parse_hash(const DilloURL *Url)
{
   if (!Url->path)
      return NULL;

   return strrchr(Url->path, '#');
}

/*
 * Check whether `url' starts with `method', ignoring case.
 * Only the first `Method_Size' characters are compared.
 * Return 1 (TRUE) if the method matches, 0 otherwise.
 */
static gint Url_match_method(const gchar *url, const char *method, 
                             size_t Method_Size)
{
   return g_strncasecmp(url, method, Method_Size) == 0;
}


/*
 * Squeeze an URL (strip /./ and /../ sequences)
 *
 * The string is rewritten IN PLACE: `str' is both the input and the
 * output buffer, and the result is never longer than the input.
 * Three cursors are used:
 *   p  - search position for the next "/." in the input,
 *   s  - start of the input text that has not been copied yet,
 *   ni - index in `str' where the next output character goes.
 *
 * Return value: the DilloURL obtained by passing the squeezed string
 *  to a_string_to_Url(). `str' itself is not freed here.
 *
 *  The funny thing is that I don't know if this is required!
 *  Anyway, it's highly tuned for speed  --Jcid
 */
DilloURL *a_Url_squeeze(gchar *str)
{
   char *s, *p;
   int i, ni, nc;

   s = p = str;
   ni = 0;
   while ( (p = strstr(p, "/.")) != NULL ) {
      if ( p[2] == '.' && (p[3] == '/' || !p[3]) ) { /* "/../" or "/.." */
         /* Copy the pending text, including the '/' found at p */
         nc = p - s;
         for ( i = 0; i <= nc; ++i, ++ni )
            str[ni] = s[i];
         /* Step back onto that '/' and remember its index in nc */
         nc = ni > 0 ? --ni : ni;
         /* Walk back to the previous '/', i.e. drop the last component */
         while ( ni && str[--ni] != '/' );
         /* Reaching index 0, or the second slash of "http://" (index 6),
          * means there is no parent to go up to */
         if (!ni || (ni == 6 && !strncmp(str, "http://",7)) )
           ni = nc;   /* parent directory missing, restore value */
         /* Continue right after the ".." (at the following '/' or NUL) */
         s = p = p + 3;
      } else if ( p[2] == '/' || !p[2] ) {  /* "/./" or "/." */
         /* Copy the pending text up to, but not including, the '/' at p */
         nc = p - s;
         for ( i = 0; i < nc; ++i )
            str[ni++] = s[i];
         /* Write a '/' without advancing ni: the next copy overwrites it */
         str[ni] = '/';
         s = p = p + 2;
      } else {                              /* "/.x" */
         /* An ordinary name that starts with a dot: keep searching */
         p += 2;
      }
   }

   /* Append the rest of 'str' */
   /* Nothing left to copy: keep the '/' already sitting at str[ni] */
   if ( str[ni] == '/' && !*s )  ++ni;
   /* Copy the remainder, terminating NUL included */
   while ( (str[ni++] = *s++) );
   /*   return str; */
   return a_string_to_Url(str);
}

/*
 * Resolve a "file:" URL
 *
 * `RelativeUrl' is interpreted against `baseurl' as follows:
 *   "file:/..."       already resolved, parsed as is,
 *   "file:" "file:."  the current working directory,
 *   "file:name"       `name' inside the current working directory,
 *   other "method:"   an absolute non-file URL, parsed as is,
 *   "/..."            a path starting from the root directory,
 *   "#..."            a name reference, appended to the base URL,
 *   anything else     a file relative to the base URL; when the last
 *                     component of the base contains ".htm" it is
 *                     replaced, otherwise the name is appended.
 *
 * Return value: a newly allocated DilloURL as produced by
 * a_string_to_Url(), or NULL when `RelativeUrl' is NULL or the base
 * URL has no string form. All temporary strings are released here.
 */
static DilloURL *a_Url_resolve_file(const DilloURL *baseurl, const gchar *RelativeUrl)
{
   gchar *slash;
   const char *rel;
   gchar *BaseUrl = a_Url_to_string(baseurl);
   gchar *NewUrl = NULL;
   DilloURL *result;
   
   if ( !BaseUrl || !RelativeUrl ) {
      g_free(BaseUrl);   /* g_free() accepts NULL */
      return NULL;
   }

   rel = RelativeUrl;
   if ( g_strncasecmp(RelativeUrl, "file:", 5) == 0 ) {
      /* An absolute file-URL! */
      rel = RelativeUrl + 5;
      if ( rel[0] == '/' ) {
         /* It was already solved (todo: squeeze it?) */
         NewUrl = g_strdup(RelativeUrl);
      } else {
         /* File reference to current directory.
          * ("file:" and "file:." show current dir */
         char *cwd = g_get_current_dir();
         if ( rel[0] == '.' && !rel[1] )
            ++rel;
         /* cwd[1] is NUL only for the root dir "/": add no second slash */
         NewUrl = g_strdup_printf("file:%s%s%s", cwd, cwd[1] ? "/" : "", rel);
         g_free(cwd);
      }
   } else if ( a_Url_is_absolute(RelativeUrl) ) {
      /* An absolute URL other than "file:" */
      NewUrl = g_strdup(RelativeUrl);
   } else if ( rel[0] == '/' ) {
      /* From here on 'BaseUrl' contains "file:" and 'rel' not */
      /* Start from root dir */
      NewUrl = g_strdup_printf("file:%s", rel);
   } else if ( rel[0] == '#' ) {
      /* Name reference, add it to BaseUrl. (todo: strip former '#') */
      NewUrl = g_strdup_printf("%s%s", BaseUrl, RelativeUrl);
   } else {
      /* a file relative to BaseUrl */
      slash = strrchr(BaseUrl, '/');
      if ( !slash ) {
         NewUrl = g_strdup_printf("file:%s", RelativeUrl);
      } else if ( a_Misc_stristr(slash, ".htm") ) {
         /* The base names a document: replace its last component */
         char *base = g_strndup(BaseUrl, slash - BaseUrl);
         NewUrl = g_strdup_printf("%s/%s", base, RelativeUrl);
         g_free(base);
      } else {
         NewUrl = g_strdup_printf("%s/%s", BaseUrl, RelativeUrl);
      }
   }

   /* a_string_to_Url() keeps its own copy, so the temporaries can go */
   result = a_string_to_Url(NewUrl);
   g_free(NewUrl);
   g_free(BaseUrl);

   return result;
}

/*
 * Resolve a relative url against a base URL.
 * This function is relatively tolerant to weird parameters (not 100%) --Jcid
 *
 *   - If either URL uses the "file:" method, the work is delegated to
 *     a_Url_resolve_file().
 *   - An absolute `RelativeUrl' ("method:...") is parsed as is.
 *   - "/..." keeps method, host and port of the base and replaces the path.
 *   - "#..." is appended to the base URL.
 *   - Anything else replaces the last path component of the base.
 *   In the last three cases the result is squeezed (a_Url_squeeze).
 *
 * Return value: a newly allocated DilloURL, or NULL when `RelativeUrl'
 * is NULL or the base URL has no string form. All temporary strings
 * are released here.
 */
DilloURL *a_Url_resolve_relative(const DilloURL *baseurl, const gchar *RelativeUrl)
{
   gchar *p;
   gint i;
   gchar *BaseUrl = a_Url_to_string(baseurl);
   gchar *NewUrl = NULL;
   DilloURL *result;

   if ( !BaseUrl || !RelativeUrl ) {
      g_free(BaseUrl);   /* g_free() accepts NULL */
      return NULL;
   }

   /* "file" method here. The ':' is part of the match so that relative
    * names which merely start with "file" (e.g. "filelist.html") are
    * not mistaken for file URLs. */
   if ( Url_match_method(BaseUrl, "file:", 5) ||
        Url_match_method(RelativeUrl, "file:", 5) ){
      result = a_Url_resolve_file(baseurl, RelativeUrl);
      g_free(BaseUrl);
      return result;
   }

   if ( a_Url_is_absolute(RelativeUrl) ){
      /* It has the "method:..." form. */
      g_free(BaseUrl);
      return a_string_to_Url(RelativeUrl);
   }

   /* Parse method:/[/]name:port/ in BaseUrl
    * e.g. http://hostname:port/
    * Every step stops at the terminating NUL, so `i' never leaves the
    * string even when BaseUrl is truncated (e.g. "http://"). */
   i = 0;
   while ( BaseUrl[i] && BaseUrl[i] != ':' )   /* method */
      i++;
   if ( BaseUrl[i] )                           /* the ':' itself */
      i++;
   while ( BaseUrl[i] == '/' )                 /* "/" or "//" */
      i++;
   while ( BaseUrl[i] && BaseUrl[i] != '/' )   /* name[:port] */
      i++;
   /* BaseUrl[i] is now the '/' that starts the path, or the NUL */

   if ( RelativeUrl[0] == '/' ){
      /* Get host from BaseUrl */
      if ( i && BaseUrl[i] == '/' ) {
         gchar *base = g_strndup(BaseUrl, i);
         NewUrl = g_strdup_printf("%s%s", base, RelativeUrl);
         g_free(base);
      } else {
         NewUrl = g_strdup_printf("%s%s", BaseUrl, RelativeUrl);
      }
   } else if ( RelativeUrl[0] == '#' ) {
      /* Name reference, add it to BaseUrl. (todo: strip former '#') */
      NewUrl = g_strdup_printf("%s%s", BaseUrl, RelativeUrl);
   } else {
      /* Get host and path from BaseUrl */
      if ( BaseUrl[i] && (p = strrchr(BaseUrl, '/')) != NULL ) {
         /* Drop the last path component of the base */
         gchar *base = g_strndup(BaseUrl, p - BaseUrl);
         NewUrl = g_strdup_printf("%s/%s", base, RelativeUrl);
         g_free(base);
      } else {
         NewUrl = g_strdup_printf("%s/%s", BaseUrl, RelativeUrl);
      }
   }

   /* a_Url_squeeze() edits NewUrl in place and parses a copy of it,
    * so both temporaries can be released afterwards */
   result = a_Url_squeeze(NewUrl);
   g_free(NewUrl);
   g_free(BaseUrl);

   return result;
}

/*
 * Report the port of an already parsed URL and return its path.
 *
 *   url      - the parsed URL.
 *   hostname - NOT written. It is a plain pointer passed by value and no
 *              buffer size is known, so nothing can be handed back
 *              through it safely; use a_Url_get_hostname() instead.
 *              NOTE(review): callers that expect the host name to be
 *              copied into this buffer need to be checked.
 *   port     - when not NULL, receives url->port.
 *
 * Return value: the path stored inside `url' (not a copy), the empty
 * string "" when the URL has no path, or NULL in case of failure
 * (`url' is NULL).
 */
char *a_Url_parse(const DilloURL *url, char *hostname, gint *port)
{
   (void) hostname;   /* see above: kept for interface compatibility */

   if (!url)
      return NULL;

   if (port)
      *port = url->port;

   return url->path ? url->path : "";
}

   /*   if (!CPtr || CPtr[1] != '/' || CPtr[2] != '/')
      return NULL;

   CPtr += 3;
   if (!(C1Ptr = strpbrk(CPtr, ":/"))) {
      Size = strlen(CPtr);
      if (!hostname)
         return (char *) CPtr + Size;
      if (Size >= hostname_size)
         return NULL;
      memcpy(hostname, CPtr, Size);
      hostname[Size] = '\0';
      return (char *) CPtr + Size;
   }
   Size = (gulong) C1Ptr - (gulong) CPtr;
   if (hostname) {
      if (Size >= hostname_size)
         return NULL;
      memcpy(hostname, CPtr, Size);
      hostname[Size] = '\0';
   }
   if (*C1Ptr != ':')
      return (char *) C1Ptr;

   if (port)
      *port = strtoul(++C1Ptr, &C1Ptr, 0);

   for (; *C1Ptr && *C1Ptr != '/'; C1Ptr++);
   return C1Ptr; */