/* uri_manipulation.js:  miscellaneous functions for URIs


   Copyright (c) 2008 World Wide Web Consortium, 
   (Massachusetts Institute of Technology, European Research 
   Consortium for Informatics and Mathematics, Keio University). 
   All Rights Reserved. This work is distributed under the 
   W3C(TM) Software License [1] in the hope that it will be 
   useful, but WITHOUT ANY WARRANTY; without even the implied 
   warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

   [1] http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231

*/

/* Revisions:
   2009-03-15 : CMSMcQ : made first version (empty) of this file
*/

// See also uri_constants for relevant static data.

// s_randomexample():  pick one entry of arr_sampleURIs at random and
// return it.  According to the original notes the entries are example
// URIs, URI references, and other strings taken from the specs.
// Takes no arguments.  The index is obtained from random_minmax(0, last),
// where last is the highest valid index of arr_sampleURIs
// (random_minmax is presumably inclusive at both ends -- confirm
// against its definition).
function s_randomexample() {
  var ixLast = arr_sampleURIs.length - 1;
  var ixPick = random_minmax(0, ixLast);
  return arr_sampleURIs[ixPick];
}

// s_make_randomurifromtnt(nLen):  generate a pseudorandom string
// by selecting bits and pieces from the TNT2 array (described in the
// original notes as containing strings matching all the terminal and
// non-terminal vocabulary of the grammar in RFC 3986).
//
//   nLen    : number of pieces to concatenate (NOT the length of the
//             result in characters; each piece may be longer than one
//             character).  A value <= 0 yields the empty string.
//   returns : the concatenation of nLen entries of TNT2, each chosen
//             independently via random_minmax(0, TNT2.length - 1).
function s_make_randomurifromtnt(nLen) {
  var ixLast = TNT2.length - 1;
  var s = "";
  // The loop counter and the drawn index are declared locally; they used
  // to be assigned without 'var', which leaked them as global variables
  // (and is an error in strict-mode code).
  for (var i = 0; i < nLen; i++) {
    var ixPick = random_minmax(0, ixLast);
    s += TNT2[ixPick];
  }
  return s;
}

// obj_analyse_URI(s):  analyse a URI by matching the string s against
// mp_NT_Regex['URI'], and return an object to serve as an associative
// array with information about scheme, authority, path, etc.
//
// If the regex does not match, the result is just { valid: false }.
// Otherwise 'valid' is true and capture groups 0..11 are stored under
// the keys string, scheme, authority, userinfo, host, port,
// path-abempty, path-absolute, path-rootless, path-empty, query and
// fragment (a group that did not participate is stored as undefined).
// 'path' is set to the last of the four path-* groups that is defined;
// if none is defined the 'path' key is absent.
//
// Note from the original author:  ??? is this dead code? I think so;
// delete it?
function obj_analyse_URI(s) {
  // Names for capture groups 0..9, in group order.
  var arr_GroupNames = ['string', 'scheme', 'authority', 'userinfo',
                        'host', 'port', 'path-abempty', 'path-absolute',
                        'path-rootless', 'path-empty'];
  var result = new Object();
  var match = mp_NT_Regex['URI'].exec(s);
  var g;

  if (match == null) {
    result['valid'] = false;
    return result;
  }

  result['valid'] = true;
  for (g = 0; g < arr_GroupNames.length; g++) {
    result[arr_GroupNames[g]] = match[g];
  }

  // Groups 6..9 are the path alternatives.  Scanning downwards and
  // stopping at the first defined one selects the last defined group.
  for (g = 9; g >= 6; g--) {
    if (typeof match[g] != "undefined") {
      result['path'] = match[g];
      break;
    }
  }

  result['query'] = match[10];
  result['fragment'] = match[11];
  return result;
}

// obj_analyse_URIref(s,kw):  analyse a string by
// (a) pre-processing it, (b) parsing it using a known regex,
// and (c) post-processing it.
// The known regexes are various expressions for URIs, IRIs, and
// xRI references.
//
//   s  : the string to analyse.
//   kw : keyword selecting the grammar.  It is used as the key into
//        mp_NT_Regex (the regex) and mp_n_prop (the names of the regex's
//        capture groups, by group number), with two special cases:
//          'URI-reference-H5'   : s is first pre-processed with 'H5',
//                                 then parsed with the regex of that name;
//          'URI-reference-WAH5' : s is first pre-processed with 'WAH5',
//                                 parsed with the plain 'URI-reference'
//                                 regex, and the result is post-processed
//                                 with 'WAH5'.
//        Any other keyword (e.g. 'URI', 'URI-reference') is used as is,
//        without pre-processing.
//
// Return an object to serve as an associative array, with information
// about scheme, authority, path, etc.  It always has the keys
//   'NT' (= kw), 'raw_string' (= s), 'preprocessed_string', 'valid';
// when the regex matches it also has one key per defined capture group,
// named according to mp_n_prop, plus 'path', which is copied from the
// first defined one of path-abempty, path-absolute, path-rootless,
// path-empty.
function obj_analyse_URIref(s,kw) {
  var components = new Object();
  // Both of these used to be assigned without 'var', which leaked them
  // as globals (and is an error in strict-mode code).
  var sPP;   // the pre-processed string
  var kw2;   // the keyword actually used for the regex lookup
  var arr_PathKinds = ['path-abempty', 'path-absolute',
                       'path-rootless', 'path-empty'];
  var i;

  // 1 pre-process and munge the kw if necessary
  if (kw == 'URI-reference-H5') {
    sPP = s_preprocess_s_kw(s,'H5');
    kw2 = kw;
  } else if (kw == 'URI-reference-WAH5') {
    sPP = s_preprocess_s_kw(s,'WAH5');
    kw2 = 'URI-reference';
  } else {
    sPP = s;
    kw2 = kw;
  }

  // 2 parse using the regex chosen
  var hits = mp_NT_Regex[kw2].exec(sPP);

  components['NT'] = kw;
  components['raw_string'] = s;
  components['preprocessed_string'] = sPP;
  if ( hits == null ) {
    components['valid'] = false;
  } else {
    components['valid'] = true;
    // Copy each capture group that took part in the match under its
    // property name; groups that did not participate are left out.
    var arr_Names = mp_n_prop[kw2];
    for (i = 0; i < arr_Names.length; i++) {
      if (typeof hits[i] != "undefined") {
        components[arr_Names[i]] = hits[i];
      }
    }
  }

  // The first defined path alternative (in grammar order) becomes 'path'.
  for (i = 0; i < arr_PathKinds.length; i++) {
    if (typeof components[arr_PathKinds[i]] != "undefined") {
      components['path'] = components[arr_PathKinds[i]];
      break;
    }
  }

  // 3 post-process depending on keyword, and return
  if (kw == 'URI-reference-WAH5') {
    return comp_postprocess_comp_kw(components,'WAH5');
  }
  return components;
}

// s_preprocess_s_kw(s,kw):  pre-process the string s before it is
// parsed, in the manner selected by the keyword kw.
//
//   kw == 'H5'   : return s_strip_s(s)  (per the notes below this strips
//                  whitespace; see s_strip_s for the exact definition).
//   kw == 'WAH5' : strip as for 'H5', then percent-encode '%' itself,
//                  every character that is not allowed in a URI, and
//                  every square bracket.
//   otherwise    : unknown keyword; do nothing, just return the input.
//
// Returns the pre-processed string.
function s_preprocess_s_kw(s,kw) {
  var sT;

  switch (kw) {
  case 'H5':
    return s_strip_s(s);

  case 'WAH5':
    // 1 strip ws
    sT = s_strip_s(s);

    // 2 percent encode non-URI chars
    // Do '%' first, though, or you risk multiply encoding things
    sT = sT.replace(/(%)/g,"%25");

    // A replacement function lets a single String.replace() pass encode
    // every offending character.  (This used to re-scan the string from
    // the start for each hit and split it with the deprecated
    // RegExp.leftContext / RegExp.rightContext statics.)  The text
    // produced for each hit consists of '%' and whatever
    // convertCP2UTF8 returns, and is not scanned again.
    //
    // Example for a single character:
    // "%".concat(convertCP2UTF8(getCPfromChar("\u30FD")).replace(/ $/,'').replace(/ /g,'%'))
    //
    // Note that because the input string may have characters with
    // code points > xFF, we can't simply take the hex of the character code
    // (Javascript returns a UTF-16 integer).  So we convert to UTF-8
    // and then to hex.  Which is the pure way to do it anyway.
    // Since we are only going to decode the hex later, it doesn't
    // actually matter in principle; if we were using something less
    // UCS-aware than Javascript, it should also work, even though
    // the interim strings might be impure.
    //
    // NOTE(review): the character class matches one UTF-16 code unit at
    // a time, so a character outside the BMP reaches getCPfromChar as
    // two separate surrogates -- confirm that the helpers cope with that.
    sT = sT.replace(/[\x00-\x20\x22<>\x5C^`{|}\x7F-\uFFFF]/g, function (ch) {
      // convertCP2UTF8 apparently returns space-separated hex octets with
      // a trailing space; drop the trailing space and turn the remaining
      // separators into '%'.
      var HH = convertCP2UTF8(getCPfromChar(ch)).replace(/ $/,'').replace(/ /g,'%');
      return "%" + HH;
    });

    // 3 percent encode square brackets, except within IPvFuture (sigh)
    // TODO: escape [] only after we are past the authority field:
    // if the string matches mp_NT_Regex['schemeplus_or_dblslash'],
    // find first //, then next following / ? or %,
    // then escape [] to the right of that point.
    // For now, though, just do them all (this is what both branches of
    // the former test on that regex did).
    sT = sT.replace(/\x5B/g,"%5B").replace(/\x5D/g,"%5D");

    // 4 percent encode hash occurrences 2..n
    // TODO: not implemented yet.
    return sT;

  default:
    // if we don't know the keyword, then do nothing, just return the input
    return s;
  }
}

// comp_postprocess_comp_kw(c,kw):  post-processing hook applied to the
// components object c produced by an analysis.
//
//   c       : the components object.
//   kw      : keyword naming the kind of post-processing wanted
//             (currently ignored).
//   returns : c itself, unchanged.
//
// At present this is a placeholder: no keyword triggers any processing.
function comp_postprocess_comp_kw(c,kw) {
  return c;
}


// discrepancy_Find_aRules_aGens_Min_MaxS_MaxNT_MaxTries(arr_Rule,
//     arr_Gens, lenMin, lenMax, lenMaxTNT, cMaxTries):
// Given a selection of rulesets (members of arr_Rulesets, currently
// ['fAppA', 'fAppB', 'fH5']), find a string that exhibits different
// behavior in the selected rulesets.
//
//   arr_Rule   : the names of the rulesets to compare.  Names other
//                than 'fAppA', 'fAppB', 'fH5' are ignored.
//   arr_Gens   : array of generator names to choose from at random on
//                each attempt:  'examples', 'ascii', 'octets', 'ucs', 'tnt'.
//   lenMin     : lower bound passed to random_minmax when drawing the
//                length of a generated string.
//   lenMax     : upper bound for the 'ascii', 'octets', 'ucs' generators.
//   lenMaxTNT  : upper bound for the 'tnt' generator (number of pieces).
//   cMaxTries  : maximum number of strings to try.
//
// Returns an object built by discrepancy_Make():  on success it carries
// the string, a description of the first difference found among the
// properties listed in arr_Props, and the array of analyses; otherwise
// an empty string and a description saying that nothing was found (or
// that fewer than two known rulesets were selected).
//
// To do (from the original notes):  perhaps give user control over
// choice of generator

var arr_Props = ['valid','scheme','authority','path','query','fragment'];

function discrepancy_Find_aRules_aGens_Min_MaxS_MaxNT_MaxTries(arr_Rule,arr_Gens,lenMin,lenMax,lenMaxTNT,cMaxTries) {
  // Keyword handed to obj_analyse_URIref() for each known ruleset.
  // The items here should agree with arr_Rulesets[].
  var mp_Rule_Kw = {
    'fAppA': 'URI-reference',
    'fAppB': 'URI-reference-AppB',
    'fH5': 'URI-reference-H5'
  };
  var hasOwn = Object.prototype.hasOwnProperty;
  var arr_Kw = new Array();
  var arr_Results = new Array();
  var cTries = 0;
  var sT = "no_example_found";
  var resBase, resCur, propName, nGenerator, nLen, key, i, ix;

  // Resolve the selected rulesets once, up front.  Only own entries are
  // considered (so enumerable additions to Array.prototype are not
  // mistaken for rulesets) and unknown names are skipped rather than
  // silently re-using the previous ruleset's result.
  for (key in arr_Rule) {
    if (hasOwn.call(arr_Rule, key) && hasOwn.call(mp_Rule_Kw, arr_Rule[key])) {
      arr_Kw[arr_Kw.length] = mp_Rule_Kw[arr_Rule[key]];
    }
  }

  // With fewer than two analyses there is nothing to compare.  Report
  // that once instead of raising an alert on every attempt.
  if (arr_Kw.length < 2) {
    return discrepancy_Make("",
                            "No comparison possible: fewer than two known rulesets were selected.");
  }

  // Continue making attempts until we are out of
  // tries or until we have found a string.
  while (cTries < cMaxTries) {
    // 1 generate a random string
    // 1a choose a generator at random
    nGenerator = random_minmax(0,arr_Gens.length - 1);

    // 1b generate the string
    // (an unrecognized generator name leaves sT as it was)
    switch (arr_Gens[nGenerator]) {
    case 'examples': // random example
      sT = s_randomexample();
      break;
    case 'ascii':
    case 'octets':
    case 'ucs':
      // these three differ only in the repertoire named to s_make_random
      nLen = random_minmax(lenMin,lenMax);
      sT = s_make_random(nLen,arr_Gens[nGenerator]);
      break;
    case 'tnt':
      nLen = random_minmax(lenMin,lenMaxTNT);
      sT = s_make_randomurifromtnt(nLen);
      break;
    }

    // 2 analyse sT using the selected rulesets
    arr_Results.length = 0;
    for (i = 0; i < arr_Kw.length; i++) {
      arr_Results[arr_Results.length] = obj_analyse_URIref(sT,arr_Kw[i]);
    }

    // 3 Test:  are the analyses all the same or not?
    // Compare result 0 to each of the other results; at the first
    // difference, return the string.
    resBase = arr_Results[0];
    for (i = 1; i < arr_Results.length; i++) {
      resCur = arr_Results[i];

      for (ix = 0; ix < arr_Props.length; ix++) {
        propName = arr_Props[ix];
        if (typeof resBase[propName] != typeof resCur[propName]) {
          return discrepancy_Make(sT,
                                  "The " + propName + " component has different types.",
                                  arr_Results);
        }
        // Both values have the same type here; a property that is
        // undefined in both analyses counts as equal.
        if (typeof resBase[propName] != "undefined"
            && resBase[propName] !== resCur[propName]) {
          return discrepancy_Make(sT,
                                  "The " + propName + " component has different values.",
                                  arr_Results);
        }
      }
    }
    // assert:  we have found no differences.  So ...

    // increment the counter and try again
    cTries++;
  }
  return discrepancy_Make("",
                          "No discrepancy found in "
                          + cTries + " attempts.");
}

// discrepancy_Make(sURI,sDesc,arr_Results):  package the outcome of a
// search for a discrepancy as an object serving as an associative array.
//
//   sURI        : stored under 'string'.
//   sDesc       : stored under 'description'.
//   arr_Results : optional; stored under 'analyses' unless it is
//                 undefined (i.e. omitted by the caller), in which case
//                 the result has no 'analyses' key at all.
//   returns     : the new object.
function discrepancy_Make(sURI,sDesc,arr_Results) {
  var report = {
    'string': sURI,
    'description': sDesc
  };
  if (typeof arr_Results == "undefined") {
    return report;
  }
  report['analyses'] = arr_Results;
  return report;
}