
// Written in the D programming language.

/**
 * String handling functions.
 *
 * To copy or not to copy?
 * When a function takes a string as a parameter, and returns a string,
 * is that string the same as the input string, modified in place, or
 * is it a modified copy of the input string? The D array convention is
 * "copy-on-write". This means that if no modifications are done, the
 * original string (or slices of it) can be returned. If any modifications
 * are done, the returned string is a copy.
 *
 * Macros:
 *	WIKI = Phobos/StdString
 * Copyright:
 *	Public Domain
 */

/* Author:
 *	Walter Bright, Digital Mars, www.digitalmars.com
 */

// The code is not optimized for speed, that will have to wait
// until the design is solidified.

/* NOTE: This is the Hatena OS version of this file.

   Modified by Akira Yamaguchi, June 2007
*/

module std.string;

import std.c.string;

/* ************* Constants *************** */

const char[16] hexdigits = "0123456789ABCDEF";			/// 0..9A..F (upper case hex digits only)
const char[10] digits    = "0123456789";			/// 0..9
const char[8]  octdigits = "01234567";				/// 0..7
const char[26] lowercase = "abcdefghijklmnopqrstuvwxyz";	/// a..z
const char[26] uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";	/// A..Z
const char[52] letters   = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
			   "abcdefghijklmnopqrstuvwxyz";	/// A..Za..z (upper case first, then lower case)
const char[6] whitespace = " \t\v\r\n\f";			/// ASCII whitespace: space, tab, vertical tab, CR, LF, form feed

const dchar LS = '\u2028';	/// Unicode line separator (U+2028)
const dchar PS = '\u2029';	/// Unicode paragraph separator (U+2029)

/// Newline sequence for this system (a single line feed)
const char[1] newline = "\n";

/**********************************
 * Test whether c is a whitespace character.
 *
 * Code points up to 0x7F are looked up in whitespace[]; above that
 * range only the line separator (LS) and the paragraph separator (PS)
 * are accepted.
 * Returns:
 *	true if c is whitespace, false otherwise.
 */

bool iswhite(dchar c)
{
    // Outside ASCII only the two Unicode separators count
    if (c > 0x7F)
	return c == LS || c == PS;

    // ASCII: membership test against the whitespace table
    return find(whitespace, c) != -1;
}

/**********************************
 * Compare two strings. cmp is case sensitive, icmp is case insensitive.
 *
 * The common prefix of the two strings is compared with memcmp; if it
 * is identical, the shorter string orders first.
 * Returns:
 *	<table border=1 cellpadding=4 cellspacing=0>
 *	$(TR $(TD < 0)	$(TD s1 < s2))
 *	$(TR $(TD = 0)	$(TD s1 == s2))
 *	$(TR $(TD > 0)	$(TD s1 > s2))
 *	</table>
 */

int cmp(char[] s1, char[] s2)
{
    // Only the part both strings have can be compared byte by byte
    size_t common = (s2.length < s1.length) ? s2.length : s1.length;

    int diff = memcmp(s1.ptr, s2.ptr, common);
    if (diff != 0)
	return diff;

    // Identical prefix: the length difference decides
    return cast(int)s1.length - cast(int)s2.length;
}

/*********************************
 * Case insensitive counterpart of cmp.
 *
 * On Win32 the common prefix is compared with memicmp. On every other
 * target it is compared character by character after folding the ASCII
 * letters 'A'..'Z' to lower case; no other characters are folded.
 * If the common prefix compares equal, the shorter string orders first.
 * Returns:
 *	<table border=1 cellpadding=4 cellspacing=0>
 *	$(TR $(TD < 0)	$(TD s1 < s2))
 *	$(TR $(TD = 0)	$(TD s1 == s2))
 *	$(TR $(TD > 0)	$(TD s1 > s2))
 *	</table>
 */

int icmp(char[] s1, char[] s2)
{
    auto len = s1.length;
    int result;

    if (s2.length < len)
	len = s2.length;
    version (Win32)
    {
	result = memicmp(s1.ptr, s2.ptr, len);
    }
    else
    {
	// Portable fallback. This must be the default branch rather than
	// version (linux): on any other target the prefix would otherwise
	// never be compared and only the lengths would decide the result.
	for (size_t i = 0; i < len; i++)
	{
	    if (s1[i] != s2[i])
	    {
		char c1 = s1[i];
		char c2 = s2[i];

		// Fold ASCII upper case to lower case before comparing
		if (c1 >= 'A' && c1 <= 'Z')
		    c1 += cast(int)'a' - cast(int)'A';
		if (c2 >= 'A' && c2 <= 'Z')
		    c2 += cast(int)'a' - cast(int)'A';
		result = cast(int)c1 - cast(int)c2;
		if (result)
		    break;
	    }
	}
    }
    // Identical prefix: the length difference decides
    if (result == 0)
	result = cast(int)s1.length - cast(int)s2.length;
    return result;
}

unittest
{
    int result;

    debug(string) printf("string.cmp.unittest\n");

    // cmp: case sensitive comparison
    result = cmp("abc", "abc");
    assert(result == 0);
    result = cmp(null, null);
    assert(result == 0);
    result = cmp("", "");
    assert(result == 0);
    result = cmp("abc", "abcd");
    assert(result < 0);
    result = cmp("abcd", "abc");
    assert(result > 0);
    result = cmp("abc", "abd");
    assert(result < 0);
    result = cmp("bbc", "abc");
    assert(result > 0);
    result = cmp("ABC", "abc");		// 'A' has a smaller code than 'a'
    assert(result < 0);

    // icmp: case insensitive comparison
    result = icmp("abc", "abc");
    assert(result == 0);
    result = icmp(null, null);
    assert(result == 0);
    result = icmp("", "");
    assert(result == 0);
    result = icmp("abc", "abcd");
    assert(result < 0);
    result = icmp("abcd", "abc");
    assert(result > 0);
    result = icmp("abc", "abd");
    assert(result < 0);
    result = icmp("bbc", "abc");
    assert(result > 0);

    // Mixed case input must compare as if it were all lower case
    result = icmp("ABC", "abc");
    assert(result == 0);
    result = icmp("abc", "ABD");
    assert(result < 0);
    result = icmp("BBC", "abc");
    assert(result > 0);
}

/******************************************
 * find, ifind _find first occurrence of c in string s.
 * rfind, irfind _find last occurrence of c in string s.
 *
 * find, rfind are case sensitive; ifind, irfind are case insensitive.
 * Returns:
 *	Index in s where c is found, -1 if not found.
 */

int find(char[] s, dchar c)
{
    if (c > 0x7F)
    {	// c is a universal character: decode s and compare code points
	foreach (int idx, dchar decoded; s)
	{
	    if (decoded == c)
		return idx;
	}
	return -1;
    }

    // Plain old ASCII: a raw byte scan is enough
    char* hit = cast(char*)memchr(s.ptr, c, s.length);
    if (!hit)
	return -1;
    return hit - s.ptr;
}

unittest
{
    debug(string) printf("string.find.unittest\n");

    // Nothing can be found in a null string
    assert(find(null, cast(dchar)'a') == -1);
    // Character not present
    assert(find("def", cast(dchar)'a') == -1);
    // The first of several occurrences is reported
    assert(find("abba", cast(dchar)'a') == 0);
    // Match on the last character
    assert(find("def", cast(dchar)'f') == 2);
}

/******************************************
 * find, ifind _find first occurrence of sub[] in string s[].
 * rfind, irfind _find last occurrence of sub[] in string s[].
 *
 * find, rfind are case sensitive; ifind, irfind are case insensitive.
 * An empty sub[] is found at index 0 of any s[], including an empty one.
 * Returns:
 *	Index in s where sub is found, -1 if not found.
 */

int find(char[] s, char[] sub)
    out (result)
    {
	if (result != -1)
	{
	    // The match must lie entirely inside s
	    assert(0 <= result && result < s.length - sub.length + 1);
	    // Go through s.ptr rather than &s[result]: when both s and sub
	    // are empty the result is 0, and indexing the empty array
	    // would fail the bounds check.
	    assert(memcmp(s.ptr + result, sub.ptr, sub.length) == 0);
	}
    }
    body
    {
	auto sublength = sub.length;

	// The empty string matches at the very start
	if (sublength == 0)
	    return 0;

	if (s.length >= sublength)
	{
	    auto c = sub[0];
	    if (sublength == 1)
	    {
		// Single character: one scan over the whole string
		auto p = cast(char*)memchr(s.ptr, c, s.length);
		if (p)
		    return cast(int)(p - s.ptr);
	    }
	    else
	    {
		// Number of start positions at which sub still fits in s
		size_t imax = s.length - sublength + 1;

		// Remainder of sub[] after its first character
		char *q = &sub[1];
		sublength--;

		for (size_t i = 0; i < imax; i++)
		{
		    // Skip ahead to the next candidate first character
		    char *p = cast(char*)memchr(&s[i], c, imax - i);
		    if (!p)
			break;
		    i = p - &s[0];
		    // First character matches; check the rest
		    if (memcmp(p + 1, q, sublength) == 0)
			return cast(int)i;
		}
	    }
	}
	return -1;
    }


unittest
{
    debug(string) printf("string.find.unittest\n");

    // Nothing can be found in a null string
    assert(find(null, "a") == -1);
    // Substring not present
    assert(find("def", "a") == -1);
    // Single character substrings
    assert(find("abba", "a") == 0);
    assert(find("def", "f") == 2);
    // A partial match ("f", then "ff") must not stop the search
    assert(find("dfefffg", "fff") == 3);
    // Match that ends exactly at the end of the string
    assert(find("dfeffgfff", "fff") == 6);
}

/***********************************************
 * Count up all instances of sub[] in s[].
 *
 * Instances are counted from left to right and do not overlap: after a
 * match the search resumes just past the end of that match.
 * Returns:
 *	The number of instances found; 0 if sub[] is empty.
 */

size_t count(char[] s, char[] sub)
{
    size_t total = 0;

    // find() reports an empty sub at offset 0 without consuming anything,
    // so the scan below would never advance. Treat it as "no instances".
    if (sub.length == 0)
	return 0;

    for (size_t i = 0; i < s.length; )
    {
	int j = find(s[i .. s.length], sub);
	if (j == -1)
	    break;
	total++;
	// j is relative to the slice; step over the match itself
	i += j + sub.length;
    }
    return total;
}

unittest
{
    debug(string) printf("string.count.unittest\n");

    char[] haystack = "This is a fofofof list";
    char[] needle = "fof";

    // "fofofof" contains "fof" twice when matches may not overlap
    assert(count(haystack, needle) == 2);
}

/***********************************************
 * See if character c is in the pattern.
 * Patterns:
 *
 *	A <i>pattern</i> is an array of characters much like a <i>character
 *	class</i> in regular expressions. A sequence of characters
 *	can be given, such as "abcde". The '-' can represent a range
 *	of characters, as "a-e" represents the same pattern as "abcde".
 *	"a-fA-F0-9" represents all the hex characters.
 *	If the first character of a pattern is '^', then the pattern
 *	is negated, i.e. "^0-9" means any character except a digit.
 *	The functions inPattern, <b>countchars</b>, <b>removeschars</b>,
 *	and <b>squeeze</b>
 *	use patterns.
 *
 * Note: In the future, the pattern syntax may be improved
 *	to be more like regular expression character classes.
 */

bool inPattern(dchar c, char[] pattern)
{
    bool negated = false;	// pattern started with '^'
    bool pendingRange = false;	// previous pattern character was a range '-'
    dchar rangeStart;		// last ordinary pattern character seen

    foreach (size_t idx, dchar pc; pattern)
    {
	// A leading '^' inverts the sense of the whole pattern
	if (idx == 0 && pc == '^')
	{
	    negated = true;
	    if (pattern.length == 1)
		return (c == pc);	// or should this be an error?
	    rangeStart = pc;
	    continue;
	}

	// pc closes a range opened by the preceding '-'
	if (pendingRange)
	{
	    pendingRange = false;
	    if ((rangeStart <= c && c <= pc) || c == pc)
		return !negated;
	    rangeStart = pc;
	    continue;
	}

	// '-' opens a range only when it has a character on both sides
	// (not counting a leading '^')
	if (pc == '-' && idx > negated && idx + 1 < pattern.length)
	{
	    pendingRange = true;
	    continue;
	}

	// Ordinary character: literal match
	if (c == pc)
	    return !negated;
	rangeStart = pc;
    }

    // Nothing matched: true only for a negated pattern
    return negated;
}


unittest
{
    debug(string) printf("std.string.inPattern.unittest\n");

    // Literal characters
    assert(inPattern('x', "x"));
    assert(!inPattern('x', "y"));
    assert(!inPattern('x', cast(char[])null));
    assert(inPattern('x', "^y"));
    assert(inPattern('x', "yxxy"));
    assert(!inPattern('x', "^yxxy"));
    assert(inPattern('x', "^abcd"));

    // '^' is special only as the first pattern character
    assert(!inPattern('^', "^^"));
    assert(inPattern('^', "^"));
    assert(inPattern('^', "a^"));

    // Ranges, plain and negated
    assert(inPattern('x', "a-z"));
    assert(!inPattern('x', "A-Z"));
    assert(!inPattern('x', "^a-z"));
    assert(inPattern('x', "^A-Z"));

    // A trailing '-' is an ordinary character
    assert(inPattern('-', "a-"));
    assert(!inPattern('-', "^A-"));

    // A reversed range matches its two end points only
    assert(inPattern('a', "z-a"));
    assert(inPattern('z', "z-a"));
    assert(!inPattern('x', "z-a"));
}


/***********************************************
 * See if character c is in the intersection of the patterns.
 * Returns:
 *	1 if c matches every pattern in patterns, 0 if it fails to match
 *	at least one of them or if patterns is empty.
 */

int inPattern(dchar c, char[][] patterns)
{
    // With no patterns there is nothing for c to match
    if (patterns.length == 0)
	return 0;

    foreach (char[] pattern; patterns)
    {
	// One miss is enough to fall outside the intersection
	if (!inPattern(c, pattern))
	    return 0;
    }
    return 1;
}


/********************************************
 * Count characters in s that match pattern.
 *
 * s is walked one decoded character (dchar) at a time and each one is
 * tested with inPattern.
 * Returns:
 *	The number of characters of s for which inPattern is true.
 */

size_t countchars(char[] s, char[] pattern)
{
    size_t matches = 0;

    foreach (dchar ch; s)
    {
	if (inPattern(ch, pattern))
	    matches++;
    }
    return matches;
}


unittest
{
    debug(string) printf("std.string.count.unittest\n");

    // Range pattern: every character of "abc" lies in a-c
    assert(countchars("abc", "a-c") == 3);
    // Literal pattern: 'o' occurs twice and 'r' once
    assert(countchars("hello world", "or") == 3);
}
