#include "udm_config.h"

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <string.h>
#include <errno.h>
#include <ctype.h>
#include <regex.h>
#include <signal.h>
#ifdef HAVE_LIBUTIL_H
#include <libutil.h>
#endif
#if (WIN32|WINNT)
#include <time.h>
#else
#include <sys/time.h>
#endif

#include "udm_common.h"
#include "udm_log.h"
#include "udm_conf.h"
#include "udm_indexer.h"
#include "udm_robots.h"
#include "udm_spell.h"
#include "udm_db.h"
#include "udm_parseurl.h"
#include "udm_charset.h"
#include "udm_parser.h"
#include "udm_proto.h"
#include "udm_hrefs.h"
#include "udm_utils.h"
#include "udm_mutex.h"
#include "udm_crc32.h"
#include "udm_xmalloc.h"

#ifdef NEWS_EXT
#include "udm_parsedate.h"
#endif

/* Non-zero while the indexer believes there may be URLs left to process.     */
/* It is set to 1 when UdmAddHref()/UdmStoreHrefs() return non-zero and when  */
/* a document was selected; UdmIndexNextURL() clears it when UdmGetDocInfo()  */
/* returns nothing without a DB error, and reports IND_NO_TARGET while it is 0 */
static int have_targets=1;
/* Private copy of the URL list file name, installed by UdmAddURLFile().      */
/* UdmURLFile() treats the name "-" as standard input.                        */
static char *url_file_name=NULL;

/* Call the optional ThreadInfo callback, but only when one is installed.     */
/* NOTE(review): expands to a bare `if` statement, so using it directly in    */
/* front of an `else` would capture that `else` - keep it as a full statement */
#define UDM_THREADINFO(h,s,m)       if(ThreadInfo)ThreadInfo(h,s,m)


/******************** URL number *******************************/
static  int  MaxURLNumber=-1;
static  int  CurURLNumber=0;
/* Return the current value of the CurURLNumber counter.                 */
/* When a LockProc callback is installed the read is bracketed by        */
/* UDM_LOCK / UDM_UNLOCK on UDM_LOCK_TARGET.                             */
__INDLIB__ int UdmGetIndexedNumber(){
	int snapshot;

	if(LockProc){
		LockProc(UDM_LOCK,UDM_LOCK_TARGET);
	}
	snapshot=CurURLNumber;
	if(LockProc){
		LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
	}
	return snapshot;
}
/* Store n into MaxURLNumber (a negative value disables the limit check  */
/* done in UdmIndexNextURL()). The write is bracketed by the             */
/* UDM_LOCK_TARGET lock when a LockProc callback is installed.           */
__INDLIB__ void UdmSetMaxURLNumber(int n){
	if(LockProc){
		LockProc(UDM_LOCK,UDM_LOCK_TARGET);
	}

	MaxURLNumber=n;

	if(LockProc){
		LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
	}
}


/********************  Words stuff *****************************/

/* AddOneWord() re-sorts the whole word array once the unsorted tail     */
/* (nwords - swords) grows beyond this many entries.                     */
#define RESORT_WORDS	256
/* Number of UDM_WORD slots by which AddOneWord() grows the word array.  */
#define WSIZE		1024

static int cmpwords(const void * v1, const void * v2){
	return(strcmp(((UDM_WORD*)v1)->word,((UDM_WORD*)v2)->word));
}
/*
 * Add one word to the per-document word list kept in Indexer.
 *
 *   Indexer   - owns the list (Word, nwords, swords, mwords) and the
 *               per-document language counters (lang[])
 *   Server    - supplies min_word_length / max_word_length
 *   word      - NUL-terminated word; a private copy is made, the caller
 *               keeps ownership of its buffer
 *   where     - bit mask OR-ed into the entry's count field
 *   checkstop - when non-zero, stopwords are not stored; they only bump
 *               the language counters
 *
 * The list is a sorted prefix [0,swords) followed by an unsorted tail
 * [swords,nwords); the tail is merged by a full qsort() once it exceeds
 * RESORT_WORDS entries.
 *
 * Returns 0 when the word was added, merged or deliberately skipped and
 * -1 when memory could not be allocated (the list is left unchanged).
 */
static int AddOneWord(UDM_INDEXER *Indexer,UDM_SERVER *Server,char *word,int where,int checkstop){
int i;
char *s;
UDM_STOPWORD * stop;
int l,r,c,res,wlen;

	wlen=strlen(word);
	if((wlen>Server->max_word_length)||(wlen<Server->min_word_length))
		return(0);
	s=strdup(word);
	if(!s)
		return(-1);
	/* Stored words are capped at UDM_MAXWORDSIZE characters */
	if(wlen>UDM_MAXWORDSIZE)s[UDM_MAXWORDSIZE]=0;
	if(checkstop){
		/* Stopwords checking and language guesser */
		if((stop=UdmIsStopWord(s))){
			UdmLog(Indexer->handle,UDM_LOG_DEBUG,"stop: '%s'-'%s'", stop->word, stop->lang);
			/* Slot 0 counts stopwords that carry no language tag; */
			/* slots 1.. are claimed by languages on first use     */
			if(!strcmp(stop->lang,""))
				Indexer->lang[0].count++;
			else
			for(i=1;i<UDM_LANGPERDOC;i++){
				if(!strcmp(Indexer->lang[i].lang,stop->lang)){
					Indexer->lang[i].count++;
					break;
				}
				/* if current lang is free */
				if(!strcmp(Indexer->lang[i].lang,"")){
					strcpy(Indexer->lang[i].lang,stop->lang);
					Indexer->lang[i].count++;
					break;
				}
			}
			free(s);
			return(0);
		}
	}
	/* Find current word in sorted part of word list */
	l=0;r=Indexer->swords-1;
	while(l<=r){
		/* l+(r-l)/2 cannot overflow, unlike (l+r)/2 */
		c=l+(r-l)/2;
		res=strcmp(Indexer->Word[c].word,s);
		if(res==0){
			Indexer->Word[c].count|=where;
			free(s);
			return(0);
		}
		if(res<0)
			l=c+1;
		else
			r=c-1;
	}
	/* Find in unsorted part */
	for(i=Indexer->swords;i<Indexer->nwords;i++){
		if(!strcmp(Indexer->Word[i].word,s)){
			Indexer->Word[i].count|=where;
			free(s);return(0);
		}
	}
	/* Add new word: grow the array by WSIZE slots when it is full */
	if(Indexer->nwords>=Indexer->mwords){
		UDM_WORD *grown;
		/* Word is only trusted as a valid pointer once mwords is non-zero, */
		/* so the very first allocation still goes through malloc()         */
		if(Indexer->mwords)
			grown=(UDM_WORD *)realloc(Indexer->Word,(Indexer->mwords+WSIZE)*sizeof(UDM_WORD));
		else
			grown=(UDM_WORD *)malloc(WSIZE*sizeof(UDM_WORD));
		if(!grown){
			/* Keep the old array intact instead of overwriting it with NULL */
			free(s);
			return(-1);
		}
		Indexer->Word=grown;
		Indexer->mwords+=WSIZE;
	}
	/* The list takes ownership of s; FreeWords() releases it */
	Indexer->Word[Indexer->nwords].word=s;
	Indexer->Word[Indexer->nwords].count=where;
	Indexer->nwords++;

	/* Sort unsorted part */
	if((Indexer->nwords-Indexer->swords)>RESORT_WORDS){
		qsort(Indexer->Word,Indexer->nwords,sizeof(UDM_WORD),cmpwords);
		Indexer->swords=Indexer->nwords;
	}
	return(0);
}

/*
 * Lower-case a word and add it to the word list through AddOneWord(),
 * using the normal form(s) returned by UdmNormalizeWord() when there are any.
 *
 *   word      - modified in place by UdmTolower()
 *   where     - weight/section mask forwarded to AddOneWord()
 *   checkstop - forwarded to AddOneWord()
 *
 * Filtering driven by the Server record:
 *   - words made only of digits are dropped when number_factor is 0;
 *   - words mixing digits with other characters are dropped when
 *     alnum_factor is 0;
 *   - normal forms are stored only when correct_factor is non-zero;
 *   - a word without normal forms is stored as is only when
 *     incorrect_factor is non-zero.
 *
 * Always returns 0.
 */
static int AddWord(UDM_INDEXER *Indexer,UDM_SERVER *Server,char *word,int where,int checkstop){
char 	** forms, ** saveforms;
int	have_digit=0;
int	have_alpha=0;

	if(Server->number_factor==0||Server->alnum_factor==0){
		char *s;
		for(s=word;*s;s++){
			/* <ctype.h> functions need an unsigned char value:  */
			/* a negative plain char (8-bit charsets) is undefined */
			if(isdigit((unsigned char)*s))
				have_digit=1;
			else
				have_alpha=1;
			if(have_digit&&have_alpha)break;
		}
		if(have_digit){
			if(have_alpha){
				if(!Server->alnum_factor)return(0);
			}else{
				if(!Server->number_factor)return(0);
			}
		}
	}
	UdmTolower(word,Indexer->local_charset);
	if((saveforms=forms=UdmNormalizeWord(word))){
		/* Add all NORMAL forms of the word */
		while(*forms){
			/* Add only if correct words are allowed */
			if(Server->correct_factor){
				AddOneWord(Indexer,Server,*forms,where,checkstop);
			}

			/* Each form and the NULL-terminated vector itself */
			/* are released here                               */
			free(*forms);
			forms++;
		}
		free(saveforms);
	}else{
		/* If NORMAL forms has not been found  */
		/*   then we will add the word itself  */
		/* Do it only when incorrect words are */
		/* allowed by configuration            */
		if(Server->incorrect_factor)
			AddOneWord(Indexer,Server,word,where,checkstop);
	}
	return(0);
}

/* Release every word string stored in Indexer->Word and mark the list   */
/* as empty (nwords and swords become 0). The Word array itself and its  */
/* capacity (mwords) are kept for reuse. Always returns 0.               */
static int FreeWords(UDM_INDEXER* Indexer) {
	int idx=0;

	while(idx<Indexer->nwords){
		free(Indexer->Word[idx].word);
		idx++;
	}
	Indexer->swords=0;
	Indexer->nwords=0;
	return 0;
}


/**************************** Built-in Parsers ***************************/

/* Split content into words with UdmGetWord() and hand each one to       */
/* AddWord() with the given weight. Nothing is done when weight is 0 or  */
/* content is NULL. Always returns 0.                                    */
static int ParseText(UDM_INDEXER * Indexer,UDM_SERVER * Server,
       char *content,int weight,int check_stopwords){
	char *token;
	char *save;

	if(!weight||!content)
		return 0;

	for(token=UdmGetWord(content,&save,Indexer->local_charset);
	    token;
	    token=UdmGetWord(NULL,&save,Indexer->local_charset)){
		AddWord(Indexer,Server,token,weight,check_stopwords);
	}
	return 0;
}


/* HTML parser states: ParseHtml() classifies the input at the current   */
/* position as one of these before dispatching                           */
#define HTML_TAG	1	/* position starts with '<'                    */
#define HTML_TXT	2	/* anything else: plain text up to next '<'    */
#define HTML_COM	3	/* position starts with "<!--"                 */

/*
 * Built-in HTML parser: walks `content` once, alternating between tags,
 * comments and text runs.
 *
 *   Indexer  - receives words (through ParseText) and may get its charset set
 *   CurSrv   - Server record supplying weights and follow/robots settings
 *   content  - NUL-terminated document body
 *   curURL   - URL of the document, used as base for relative links;
 *              re-parsed in place from <BASE HREF> in some configurations
 *   doc      - supplies url_id and hops for newly found links
 *   index    - non-zero: words are indexed; a robots META tag may change it
 *   follow   - non-zero: links are collected; a robots META tag may change it
 *   text     - output: body text, appended up to UDM_MAXTEXTSIZE
 *   keywords - output: content of META keywords
 *   descript - output: content of META description
 *   title    - output: text found inside <title>, up to UDM_MAXTITLESIZE
 *
 * index and follow are passed by value, so changes made here are not
 * visible to the caller. Always returns 0.
 */
static int ParseHtml(
UDM_INDEXER * Indexer,
UDM_SERVER * CurSrv,
char *content,
UDM_URL * curURL,
UDM_DOCUMENT * doc,
int index,int follow,
char *text,
char *keywords,
char *descript,
char *title){

	int inbody=0;
	int inscript=0;
	int instyle=0;
	int intitle=0;
	int hrefonly=0;
	int comment=0;
	char *htok;
	char str[UDMSTRSIZ]="";
	char res[UDMSTRSIZ]="";

	/* Decide whether this document is "href only": links are followed */
	/* but no words, keywords or description are taken from it         */
	/* NOTE(review): unbounded strcpy/strcat into str[UDMSTRSIZ];      */
	/* assumes path+filename always fit - confirm against UdmParseURL  */
	strcpy(str, curURL->path);
	strcat(str, curURL->filename);
	hrefonly=(UDM_HREFONLY==UdmFilterType(str,res));

	htok=content;

	while(*htok){
		char * href=NULL;
		char * tmp=NULL;
		char * hend;
		char * etmp;
		char * stmp;
		char * s;
		int have_space;
		int opening;
		UDM_TAG tag;
		int state=HTML_TXT;
		int len;


		/* Classify what starts at htok; "<!--" must be tested before '<' */
		if(!UDM_STRNCMP(htok,"<!--"))	state=HTML_COM;
		else	if(*htok=='<')		state=HTML_TAG;

		switch(state){
		case HTML_TAG: /* tag */

			href=NULL;
			/* hend ends up one past the closing '>' (or at the NUL) */
			for(hend=htok;(*hend!='>')&&(*hend);hend++);
			if(*hend=='>')hend++;
			/* Work on a private copy of the tag text                */
			/* NOTE(review): malloc() result is not checked          */
			tmp=(char*)malloc(hend-htok+1);
			strncpy(tmp,htok,hend-htok);tmp[hend-htok]=0;
			
			UdmParseTag(&tag,tmp);

			/* Convert to lower case */
			/* NOTE(review): tolower() gets a plain char here */
			for(s=tag.tag;*s;*s=tolower(*s),s++);

			/* Detect whether opening or closing tag */
			if((opening=(tag.tag[0]!='/')))s=tag.tag;
			else				s=tag.tag+1;

			/* Let's find tag name in order of frequency */

			if(!strcmp(s,"a"))href=tag.href;/*117941*/
			else	if(!strcmp(s,"title"))	intitle=opening;/*6192*/
			else	if(!strcmp(s,"body"))	inbody=opening;	/*5146*/
			else
			/* Compares tag.tag, not s, so only an opening <meta> matches */
			if((!strcmp(tag.tag,"meta"))&&(tag.name)&&(tag.content)){
#ifndef USE_CHARSET_GUESSER	/* meta 4160 */
				/* A META Content-Type is honoured only when no charset */
				/* has been set yet; the whole content buffer is then   */
				/* recoded in place                                     */
				if((!strcasecmp(tag.name,"Content-Type"))&&(!Indexer->charset)){
					char *p;
					if((p=strstr(tag.content,"charset="))){
						/* NOTE(review): charset now points into tag.content, */
						/* and UdmFreeTag(&tag) runs at the end of this tag - */
						/* confirm the pointer is not used after that         */
						Indexer->charset = p + 8;
						UdmRecode(content,UdmGetCharset(Indexer->charset),Indexer->local_charset);
					}
				}else
#endif
				if(!strcasecmp(tag.name,"refresh")){
					/* The target after "URL=" is treated like a link */
					if((href=strstr(tag.content,"URL=")))
						href+=4;
				}else
				if(!hrefonly&&!strcasecmp(tag.name,"keywords")){
					strncpy(keywords,tag.content,UDM_MAXKEYWORDSIZE-1);
					keywords[UDM_MAXKEYWORDSIZE-1]=0;
					if(index&&CurSrv->keywordweight){
						/* ParseText gets a scratch copy in str, not keywords itself */
						/* NOTE(review): assumes UDM_MAXKEYWORDSIZE <= UDMSTRSIZ     */
						strcpy(str,keywords);
						ParseText(Indexer,CurSrv,str,
							CurSrv->keywordweight,1);
					}
				}else
				if(!hrefonly&&!strcasecmp(tag.name,"description")){
					strncpy(descript,tag.content,UDM_MAXDESCSIZE-1);
					descript[UDM_MAXDESCSIZE-1]=0;
					if(index&&CurSrv->descweight){
						/* NOTE(review): assumes UDM_MAXDESCSIZE <= UDMSTRSIZ */
						strcpy(str,descript);
						ParseText(Indexer,CurSrv,str,
							CurSrv->descweight,1);
					}
				}else
				if(!strcasecmp(tag.name,"robots")&&CurSrv->userobots&&tag.content){
					/* Each directive updates the local index/follow flags; */
					/* later directives override earlier ones               */
					char * lt;
					char * s;
					s=UdmGetWord(tag.content,&lt,Indexer->local_charset);
					while(s){
						if(!strcasecmp(s,"ALL")){
							follow=1;index=1;
						}else
						if(!strcasecmp(s,"NONE")){
							follow=0;index=0;
						}else
						if(!strcasecmp(s,"NOINDEX"))
							index=0;
						else
						if(!strcasecmp(s,"NOFOLLOW"))
							follow=0;
						else
						if(!strcasecmp(s,"INDEX"))
							index=1;
						else
						if(!strcasecmp(s,"FOLLOW"))
							follow=1;
						s=UdmGetWord(NULL,&lt,Indexer->local_charset);
					}
				}
			}
			else	if(!strcmp(s,"img"))	href=tag.src;/*2786*/
			else	if(!strcmp(s,"link"))	href=tag.href;/*2241*/
			else	if(!strcmp(s,"frame"))	href=tag.src;
			else	if(!strcmp(s,"script"))	inscript=opening;
			else	if(!strcmp(s,"style"))	instyle=opening;
			else	if(!strcmp(s,"area"))	href=tag.href;
			else	if((!strcmp(s,"base"))&&(tag.href)){
				/* <BASE HREF="xxx"> stuff            */
				/* Check that URL is properly formed  */
				/* baseURL is just temporary variable */
				/* If parsing is fail we'll use old   */
				/* base href, passed via curURL       */
				/* The last is surely correct here    */
				
				/* Note that we will not check BASE     */
				/* if delete_no_server is unset         */
				/* This is  actually dirty hack. We     */
				/* must check that hostname is the same */
				
				
				if((CurSrv->delete_no_server)||(CurSrv->outside)){
					UDM_URL baseURL;
					int res;
					
					/* Parse into the scratch baseURL first; curURL is only */
					/* overwritten once the BASE HREF is known to be valid  */
					if(!(res=UdmParseURL(&baseURL,tag.href))){
						UdmParseURL(curURL,tag.href);
					}else{
						switch(res){
						case UDM_PARSEURL_LONG:
							UdmLog(Indexer->handle,UDM_LOG_ERROR,"BASE HREF too long: '%s'",tag.href);
							break;
						case UDM_PARSEURL_BAD:
						default:
							UdmLog(Indexer->handle,UDM_LOG_ERROR,"Error in BASE HREF URL: '%s'",tag.href);
						}
					}
				}
			}

			/* A link was found in this tag: resolve it against curURL, */
			/* filter it and queue it for indexing                      */
			if((href)&&(follow)&&(CurSrv->gfollow)){
				UDM_URL newURL;
				int res;
				if(doc->hops>=CurSrv->maxhops){
					UdmLog(Indexer->handle,UDM_LOG_DEBUG,"Skip \"%s\" : too many hops: %d",href,doc->hops);
				}else
				if(!(res=UdmParseURL(&newURL,href))){
					char * newschema;
					int Method;
					char reason[UDMSTRSIZ]="";
					/* Parts missing from the link are taken from curURL */
					if(newURL.schema[0])newschema=newURL.schema;
					else	newschema=curURL->schema;
					/* NOTE(review): both sprintf() calls write into      */
					/* str[UDMSTRSIZ] without a length check - confirm    */
					/* that UdmParseURL bounds the combined length        */
					if(!strcmp(newschema,"file")||!strcmp(newschema,"htdb")){
						sprintf(str,"%s:%s%s",newschema,newURL.path[0]?newURL.path:curURL->path,newURL.filename);
					}else{
						sprintf(str,"%s://%s%s%s",
						newURL.schema[0]?newURL.schema:curURL->schema,
						newURL.hostinfo[0]?newURL.hostinfo:curURL->hostinfo,
						newURL.path[0]?newURL.path:curURL->path,newURL.filename);
					}
					/* Collapse dot segments in everything after the schema colon */
					if((newschema=strchr(str,':')))
						UdmRemove2Dot(newschema+1);
					
					/* Cut the ";type=..." suffix from ftp URLs */
					if(!UDM_STRNCMP(str,"ftp://")&&(strstr(str,";type=")))
						*(strstr(str,";type"))=0;

					Method=UdmFilterType(str,reason);
					UdmLog(Indexer->handle,UDM_LOG_DEBUG,"\"%s\" : '%s'",href,reason);

					/* Disallowed links are dropped silently */
					if((Method==UDM_DISALLOW)){
					}else
					if((UdmFindRobots(Indexer,str)>=0)&&(CurSrv->userobots)){
						UdmLog(Indexer->handle,UDM_LOG_DEBUG,"Skip \"%s\" : robots",href);
						UdmLog(Indexer->handle,UDM_LOG_DEBUG,"Full URL \"%s\"",str);
					}else
					if(!UdmFindServer(str)){
						UdmLog(Indexer->handle,UDM_LOG_DEBUG,"Skip \"%s\" : no Server",href);
					}else{
						int add=1;
						/* compare hostinfo in some cases */
						if((!CurSrv->delete_no_server)&&(!CurSrv->outside)){
							if(newURL.hostinfo[0])
								add=!strcmp(curURL->hostinfo,newURL.hostinfo);
						}
						if(add){
						
							/* have_targets and the href cache are updated */
							/* under UDM_LOCK_TARGET                       */
							if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
							
							/* Add URL itself */
							if(UdmAddHref(NULL,str,doc->url_id,doc->hops+1,0))
								have_targets=1;
							
							/* Add robots.txt for HTTP schema */
							/* When FollowOutside or DeleteNoServer no */
							if((!strcmp(newURL.schema,"http"))&&(CurSrv->userobots)&&
							((CurSrv->outside)||(!CurSrv->delete_no_server))){
								char str1[UDMSTRSIZ]="";
								sprintf(str1,"%s://%s/%s",newURL.schema,newURL.hostinfo,"robots.txt");
								if(UdmAddHref(NULL,str1,0,0,0))
									have_targets=1;
							}
							
							if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
						}
					}
				}else{
					switch(res){
					case UDM_PARSEURL_LONG:
						UdmLog(Indexer->handle,UDM_LOG_DEBUG,"URL too long: '%s'",href);
						break;
					case UDM_PARSEURL_BAD:
					default:
						UdmLog(Indexer->handle,UDM_LOG_DEBUG,"Error in URL: '%s'",href);
					}
				}
			}
			UdmFreeTag(&tag);
			free(tmp);
			htok=hend;
			break;
 
		case HTML_COM: /* comment */

			/* <!--UdmComment--> ... <!--/UdmComment--> brackets a region */
			/* whose text is neither stored nor indexed                   */
			if(!UDM_STRNCASECMP(htok,"<!--UdmComment-->"))
				comment=1;
			else
			if(!UDM_STRNCASECMP(htok,"<!--/UdmComment-->"))
				comment=0;

			/* Skip to just past the closing "-->", or to the end of input */
			while(*htok){
				if(!UDM_STRNCMP(htok,"-->")){
					htok+=3;
					break;
				}
				htok++;
			}
			break;

		case HTML_TXT: /* text */

			if(inscript){
				/* Special case when script  */
				/* body is not commented:    */
				/* <script> x="<"; </script> */
				/* We should find </script>  */
				/* in this case:             */
				for(hend=htok;*hend;hend++){
					if(!UDM_STRNCASECMP(hend,"</script>"))
						break;
				}
			}else{
				for(hend=htok;(*hend)&&(*hend!='<');hend++);
			}
			/* Every input character yields at most a pending space plus */
			/* one character, hence twice the run length (plus slack)    */
			/* NOTE(review): malloc() result is not checked              */
			tmp=(char*)malloc(2*(hend-htok)+6);
			etmp=tmp;*etmp=0;have_space=0;
			/* Copy the run into tmp, collapsing whitespace into single */
			/* spaces and decoding character references                 */
			for(stmp=htok;stmp<hend;stmp++){
				char sch;
				switch(*stmp){
				case ' ' : case '\n':
				case '\r': case '\t':
					/* Only remembered here; emitted before the next character */
					have_space++; break;

				case '&': /* parse specials */
					if(have_space){
						*etmp=' ';etmp++;
						have_space=0;
					}
					stmp++;
					if(*stmp=='#'){ /* &#234; */
						char *code;
						code=++stmp;
						/* NOTE(review): isdigit() gets a plain char, and      */
						/* *(stmp+1) may look one byte past hend when "&#"     */
						/* ends the run - confirm                              */
						while((stmp<hend)&&(isdigit(*(stmp+1))))
							stmp++;
						if(*(stmp+1)==';')stmp++;
						/* The numeric value is truncated to a single char */
						*etmp=(char)(atoi(code));
						etmp++;*etmp=0;
					}else
					if((sch=UdmSgmlToChar(stmp,Indexer->local_charset))){
						if(sch==' ')have_space++;
						*etmp=sch;etmp++;*etmp=0;
						/* Skip the rest of the entity name and its ';' */
						while((stmp<hend)&&(isalnum(*(stmp+1))))
							stmp++;
						if(*(stmp+1)==';')stmp++;
					}else{
						/* Unknown entity: a '?' is emitted for the '&' */
						*etmp='?';etmp++;*etmp=0;
					}
					break;
				default:
					if(have_space){
						*etmp=' ';etmp++;
						have_space=0;
					}
					*etmp=*stmp;
					etmp++;*etmp=0;
					break;
				}
			}
			/* Body text outside scripts, styles and UdmComment regions is */
			/* appended to text, space separated, within UDM_MAXTEXTSIZE   */
			if((!comment&&inbody&&!inscript&&!instyle)&&((len=strlen(text))<UDM_MAXTEXTSIZE-2)){
				len=UDM_MAXTEXTSIZE-2-len;
				if(*text)strcat(text," ");
				strncat(text,tmp,len);
				text[UDM_MAXTEXTSIZE-1]=0;
			}
			/* Same for the title buffer while inside <title> */
			if((intitle)&&((len=strlen(title))<UDM_MAXTITLESIZE-2)){
				len=UDM_MAXTITLESIZE-2-len;
				if(*title)strcat(title," ");
				strncat(title,tmp,len);
				title[UDM_MAXTITLESIZE-1]=0;
			}
			/* Index the words; the weight is bodyweight and/or titleweight */
			/* depending on where we are, and ParseText ignores weight 0    */
			if(!comment&&!hrefonly&&index&&!inscript&&!instyle){
				ParseText(Indexer,CurSrv,tmp,
					CurSrv->bodyweight*inbody+
					CurSrv->titleweight*intitle,1);
			}
			free(tmp);
			htok=hend;
			break;
		}
	}
	return(0);
}

/****************************************************************/
static int cmplang(const void *s1,const void *s2){
	return(((UDM_LANG*)s2)->count-((UDM_LANG*)s1)->count);
}


/* Release a UDM_DOCUMENT: each string member goes through UDM_FREE(),   */
/* then the structure itself is freed. A NULL pointer is accepted.       */
static void FreeDoc(UDM_DOCUMENT *Result){
	if(Result){
		UDM_FREE(Result->url);
		UDM_FREE(Result->content_type);
		UDM_FREE(Result->title);
		UDM_FREE(Result->text);
		UDM_FREE(Result->last_index_time);
		UDM_FREE(Result->next_index_time);
		UDM_FREE(Result->keywords);
		UDM_FREE(Result->description);
		free(Result);
	}
}


/*********************** 'UrlFile' stuff (for -f option) *******************/
/* Remember the name of the URL list file: the previous name is released */
/* with UDM_FREE() and, when name is not NULL, a private copy is stored. */
__INDLIB__ void UdmAddURLFile(char *name){
	UDM_FREE(url_file_name);
	if(!name){
		return;
	}
	url_file_name=strdup(name);
}

/*
 * Read the URL list file named by url_file_name ("-" means stdin) and
 * apply `action` to every URL in it.
 *
 * File format, as implemented below: one URL per line; CR/LF are stripped;
 * empty lines and lines starting with '#' are skipped; a line ending in
 * '\' is joined with the following line.
 *
 * action:
 *   UDM_URL_FILE_REINDEX - limit to the URL and call UdmMarkForReindex()
 *   UDM_URL_FILE_CLEAR   - limit to the URL and call UdmClearDB()
 *   UDM_URL_FILE_INSERT  - queue the URL with UdmAddHref()
 *   UDM_URL_FILE_PARSE   - only check that the URL parses and has a schema
 *
 * Returns IND_OK when the whole file was processed. On failure the file
 * is closed and an error code is returned: IND_ERROR when no file name
 * is set, the file cannot be opened, a joined line does not fit into the
 * line buffer, a URL is malformed (PARSE) or UdmClearDB() fails; the
 * UdmMarkForReindex() result for REINDEX.
 */
__INDLIB__ int UdmURLFile(UDM_INDEXER *Indexer, int action){
FILE *url_file;
char str[UDMSTRSIZ]="";
char str1[UDMSTRSIZ]="";
int result,res;
int rc=IND_OK;
UDM_URL myurl;

	if(!url_file_name){
		UdmLog(0,UDM_LOG_ERROR,"No URL file name given");
		return(IND_ERROR);
	}

	if(!strcmp(url_file_name,"-"))
		url_file=stdin;
	else
	if(!(url_file=fopen(url_file_name,"r"))){
		UdmLog(0,UDM_LOG_ERROR,"Can't open URL file '%s': %s",url_file_name,strerror(errno));
		return(IND_ERROR);
	}

	while(fgets(str1,sizeof(str1),url_file)){
		char *end;
		int continued;
		if(!str1[0])continue;
		/* Strip trailing CR/LF; end is left on the last kept character */
		end=str1+strlen(str1)-1;
		while((end>=str1)&&(*end=='\r'||*end=='\n')){
			*end=0;if(end>str1)end--;
		}
		if(!str1[0])continue;
		if(str1[0]=='#')continue;

		/* A trailing backslash joins this line with the next one */
		continued=(*end=='\\');
		if(continued)*end=0;

		/* Joined lines accumulate in str: refuse to run past its end */
		if(strlen(str)+strlen(str1)>=sizeof(str)){
			UdmLog(0,UDM_LOG_ERROR,"URL too long: '%s'",str);
			rc=IND_ERROR;
			goto done;
		}
		strcat(str,str1);
		if(continued)continue;
		strcpy(str1,"");

		switch(action){
		case UDM_URL_FILE_REINDEX:
			UdmAddURLLimit(str);
			if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
			have_targets=1;
			if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
			result=UdmMarkForReindex(Indexer);
			if(result!=IND_OK){
				rc=result;
				goto done;
			}
			UdmClearURLLimit();
			break;
		case UDM_URL_FILE_CLEAR:
			UdmAddURLLimit(str);
			result=UdmClearDB(Indexer);
			if(result!=IND_OK){
				rc=IND_ERROR;
				goto done;
			}
			UdmClearURLLimit();
			break;
		case UDM_URL_FILE_INSERT:
			if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
			if(UdmAddHref(Indexer,str,0,0,0))
				have_targets=1;
			if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);

			break;
		case UDM_URL_FILE_PARSE:
			res=UdmParseURL(&myurl,str);
			/* A URL without a schema is rejected as well */
			if((!res)&&(!myurl.schema[0]))
				res=UDM_PARSEURL_BAD;
			if(res){
				/* INDEXER is NULL: when parsing file */
				/* So we'll use 0 instead of handler  */
				/* in the call of UdmLog              */
				switch(res){
				case UDM_PARSEURL_LONG:
					UdmLog(0,UDM_LOG_ERROR,"URL too long: '%s'",str);
					break;
				case UDM_PARSEURL_BAD:
				default:
					UdmLog(0,UDM_LOG_ERROR,"Error in URL: '%s'",str);
				}
				rc=IND_ERROR;
				goto done;
			}
			break;
		}
		str[0]=0;
	}

done:
	/* Single exit: the file is closed on success and on every error */
	if(url_file!=stdin)
		fclose(url_file);
	return(rc);
}


/******* Main indexer functions StartUp & IndexNextURL ********/

/* Init section for indexing */

/* One-time preparation for an indexing run.                             */
/*  - resets CurURLNumber and have_targets;                              */
/*  - UDM_FLAG_MARK: applies the URL file (if any) in REINDEX mode and   */
/*    then calls UdmMarkForReindex();                                    */
/*  - UDM_FLAG_INSERT with a URL file: inserts the URLs from the file;   */
/*  - loads robots rules and the stop list;                              */
/*  - sets DBUseLock to 0 when UDM_FLAG_SKIP_LOCKING is given, else 1.   */
/* Returns 1 as soon as UdmDBErrorCode() reports an error, 0 otherwise.  */
static int StartUp(UDM_INDEXER * Indexer, int index_flags){
	int want_mark=index_flags&UDM_FLAG_MARK;
	int want_insert=index_flags&UDM_FLAG_INSERT;

	CurURLNumber=0;
	have_targets=1;

	if(want_mark){
		if(url_file_name){
			UdmURLFile(Indexer,UDM_URL_FILE_REINDEX);
		}
		UdmMarkForReindex(Indexer);
		if(UdmDBErrorCode(Indexer->db))
			return 1;
	}

	if(url_file_name&&want_insert){
		UdmURLFile(Indexer,UDM_URL_FILE_INSERT);
		if(UdmDBErrorCode(Indexer->db))
			return 1;
	}

	UdmLoadRobots(Indexer);
	if(UdmDBErrorCode(Indexer->db))
		return 1;

	UdmLoadStopList(Indexer->db);
	if(UdmDBErrorCode(Indexer->db))
		return 1;

	DBUseLock=(index_flags&UDM_FLAG_SKIP_LOCKING)?0:1;
	return 0;
}


__INDLIB__ int UdmIndexNextURL(UDM_INDEXER *Indexer,int index_flags){
char request[UDMSTRSIZ]="";
char reason[UDMSTRSIZ]="";
udmcrc32_t crc32;

char *lt,*tok;
char *content=NULL,*header,*content_type,*location,*statusline;
char *hcopy=NULL;

int size,status=0,realsize,Method=0;
int origin,follow,index,changed;
int reindex;
int found_in_mirror = 0;
int result=IND_UNKNOWN;

UDM_URL		curURL;
UDM_URL		realURL;
UDM_DOCUMENT	*Doc=NULL;
UDM_SERVER	*CurSrv=0;
UDM_ALIAS	*Alias;

char text[2*UDM_MAXTEXTSIZE]="";
char text_escaped[2*UDM_MAXTEXTSIZE+1]="";
char keywords[UDM_MAXKEYWORDSIZE]="";
char keywords_escaped[2*UDM_MAXKEYWORDSIZE+1]="";
char descript[UDM_MAXDESCSIZE]="";
char descript_escaped[2*UDM_MAXDESCSIZE+1]="";
time_t last_mod_time=0;
char buf[UDM_MAXTIMESTRLEN]="";
char title[UDM_MAXTITLESIZE]="";	
char title_escaped[UDM_MAXTITLESIZE*2+1];
char subj[UDM_MAXTITLESIZE]="";
char from[UDM_MAXKEYWORDSIZE]="";
char content_type_escaped[256]="";

#ifdef NEWS_EXT
/* I need to retain newsgroup-info, so I introduce new variables */
/* I also stick with these fixed-length strings, although I dislike them.*/
char HeaderDate[UDM_MAXDATESIZE] = "";
char HeaderDateEsc[2*UDM_MAXDATESIZE+1] = "";
char HeaderSubject[UDM_MAXSUBJSIZE] = "";
char HeaderSubjectEsc[2*UDM_MAXSUBJSIZE+1] = "";
char HeaderFrom[UDM_MAXFROMSIZE] = "";
char HeaderFromEsc[2*UDM_MAXFROMSIZE+1] = "";
char HeaderGroup[UDM_MAXGROUPSIZE] = "";
char HeaderGroupEsc[2*UDM_MAXGROUPSIZE+1] = "";
char HeaderRefs[UDM_MAXREFSIZE] = "";
char HeaderRefsEsc[2*UDM_MAXREFSIZE+1] = "";
char MessageIdEsc[2*UDM_MAXFROMSIZE+1]="";
char *parent=NULL, *SQLDate;
#endif

#ifdef USE_PARSER
char *mime;
int mimeno;
#endif

	/* Do init stuff if required */
	if(index_flags&UDM_FLAG_INIT){
		if(StartUp(Indexer, index_flags))
			return(IND_ERROR);
		else
			return(IND_OK);
	}else{
		int j;
		/* Clear language statistics */
		for(j=0;j<UDM_LANGPERDOC;j++){
			Indexer->lang[j].lang[0]=0;
			Indexer->lang[j].count=0;
		}
	}

	if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
	/* Store URLs from cache into database */
	if(UdmStoreHrefs(Indexer))
		have_targets=1;
	if(UdmDBErrorCode(Indexer->db)){
		result=IND_ERROR;
	}else
	if(!have_targets){
		result=IND_NO_TARGET;
	}else
	if((CurURLNumber>=MaxURLNumber)&&(MaxURLNumber>=0)){
		result=IND_NO_TARGET;
	}else{
		/* Get Next URL to be indexed from the database */
		UDM_THREADINFO(Indexer->handle,"Selecting","");
		if(!(Doc=UdmGetDocInfo(Indexer,index_flags))){
			if(UdmDBErrorCode(Indexer->db)){
				result=IND_ERROR;
			}else{
				have_targets=0;
				result=IND_NO_TARGET;
			}
		}else{
			/* Increment number of indexed pages */
			CurURLNumber++;
			/* Add current URL in memory cache to avoid */
			/* possible double INSERT INTO url          */
			UdmAddHref(Indexer,Doc->url,0,Doc->hops,1);
			have_targets=1;
		}
	}
	if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
	/* In GUI version we may send "terminate" signal       */
	/* This line checks whether it already has happened    */
	/* It allows terminate indexing faster                 */
	/* We include this line in several places of this func */
	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;
	if(result)return(result);

	/* Alloc buffer for document */
	if(!Indexer->buf){
		Indexer->buf=(char*)malloc(UdmMaxDocSize());
	}

	UdmLog(Indexer->handle,UDM_LOG_INFO,"%s",Doc->url);
	UDM_THREADINFO(Indexer->handle,"Found",Doc->url);
	reindex=index_flags&UDM_FLAG_REINDEX;

#ifdef HAVE_SETPROCTITLE

	/* To see the URL being indexed   */
	/* in "ps" output on FreeBSD      */
	/* Do it if single thread version */

	if(!(Indexer->handle))
		setproctitle("%s",Doc->url);
#endif

	/* Check that URL has valid syntax */
	if(UdmParseURL(&curURL,Doc->url)){
		result=UdmDeleteUrl(Indexer,Doc->url_id);
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Invalid URL: %s ... deleting",Doc->url);
		FreeDoc(Doc);
		return(result);
	}

	/* Find corresponded Server record from indexer.conf */
	if(!(CurSrv=UdmFindServer(Doc->url))){
		if(!strcmp(curURL.filename,"robots.txt")){
			if(IND_OK==(result=UdmDeleteRobotsFromHost(Indexer,curURL.hostinfo)))
				result=UdmLoadRobots(Indexer);
		}else{
			UdmLog(Indexer->handle,UDM_LOG_WARN,"No 'Server' command for url... deleted.");
			result=IND_OK;
		}
		if(result==IND_OK)result=UdmDeleteUrl(Indexer,Doc->url_id);
		FreeDoc(Doc);
		return(result);
	}

	/* Check that hops is less than MaxHops */
	if(Doc->hops>CurSrv->maxhops){
		result=UdmDeleteUrl(Indexer,Doc->url_id);
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Too many hops (%d)... deleted.",Doc->hops);
		FreeDoc(Doc);
		return(result);
	}

	/* Check Allow/Disallow/CheckOnly stuff */

	Method=UdmFilterType(Doc->url,reason);
	UdmLog(Indexer->handle,UDM_LOG_DEBUG,"'%s'",reason);

	if(Method==UDM_DISALLOW){
		result=UdmDeleteUrl(Indexer,Doc->url_id);
		if((result==IND_OK)&&(!strcmp(curURL.filename,"robots.txt"))){
			if(IND_OK==(result=UdmDeleteRobotsFromHost(Indexer,curURL.hostinfo)))
				result=UdmLoadRobots(Indexer);
		}
		FreeDoc(Doc);
		return(result);
	}

	/* Check whether URL is disallowed by robots.txt */
	if((CurSrv->userobots)&&(UdmFindRobots(Indexer,Doc->url)>=0)){
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Deleting URL: robots.txt");
		result=UdmDeleteUrl(Indexer,Doc->url_id);
	}else
	/* Check whether FTP is used without proxy */
/*	if((!strcmp(curURL.schema,"ftp"))&&(!CurSrv->proxy)){
		UdmLog(Indexer->handle,UDM_LOG_WARN,"FTP is available via proxy only, skipped");
		result=UdmUpdateUrl(Indexer,Doc->url_id,504,CurSrv->period);
	}else
*/	/* Check for too many errors on this server */
	if(CurSrv->net_errors>CurSrv->max_net_errors){
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Too many network errors for this server, skipped");
		result=UdmUpdateUrl(Indexer,Doc->url_id,504,CurSrv->period);
	}
	if(result){
		FreeDoc(Doc);
		return(result);
	}

	/* Find alias */
	UdmParseURL(&realURL,Doc->url);
	if((Alias=UdmFindAlias(Doc->url))){
		char rurl[UDMSTRSIZ];
		sprintf(rurl,"%s%s",Alias->replace,Doc->url+strlen(Alias->find));
		if(UdmParseURL(&realURL,rurl)){
			UdmLog(Indexer->handle,UDM_LOG_ERROR,"Error in aliased URL: '%s'",rurl);
			FreeDoc(Doc);
			return(IND_OK);
		}else{
			UdmLog(Indexer->handle,UDM_LOG_EXTRA,"Alias: '%s'",rurl);
		}
	}


	/* Compose HTTP/1.0 request header */
	sprintf(request,"%s %s%s HTTP/1.0\r\n",
		(Method==UDM_HEAD)?"HEAD":"GET",
		CurSrv->proxy?Doc->url:realURL.path,
		CurSrv->proxy?"":realURL.filename);

	/* Add If-Modified-Since header */
	if((Doc->status)&&(Doc->last_mod_time)&&(!reindex)){
		UdmTime_t2HttpStr(Doc->last_mod_time, buf);
		sprintf(UDM_STREND(request),"If-Modified-Since: %s\r\n",
			buf);
	}

	/* HTTP and FTP specific stuff */
	if((!strcmp(realURL.schema,"http"))||(!strcmp(realURL.schema,"ftp"))){

		/* User agent */
		sprintf(UDM_STREND(request),"User-Agent: %s\r\n",UdmUserAgent());

		/* If LocalCharset specified */
		if( Indexer->local_charset )
			sprintf(UDM_STREND(request),"Accept-charset: %s\r\n",
				(char*)UdmCharsetStr(Indexer->local_charset));

		/* Host Name for virtual hosts */
		sprintf(UDM_STREND(request),"Host: %s\r\n",realURL.hostname);

		/* Auth if required */
		if(CurSrv->basic_auth)
			sprintf(UDM_STREND(request),"Authorization: Basic %s\r\n",
				CurSrv->basic_auth);
	}

	/* Add user defined headers */
	strcat(request,UdmExtraHeaders());

	/* Empty line is the end of HTTP header */
	strcat(request,"\r\n");

	size=UDM_NET_UNKNOWN;

	UDM_THREADINFO(Indexer->handle,"Getting",Doc->url);

	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;

	/* If mirroring is enabled */
	if (CurSrv->use_mirror >= 0) {
		char errstr[UDMSTRSIZ]="";
		/* on u_m==0 it returned by mtime from mirrorget */
		/* but we knew that it should be put in mirror  */

		size = UdmMirrorGET(realURL.schema, realURL.hostname,
				realURL.path, realURL.filename,
				Indexer->buf, UdmMaxDocSize(),
				CurSrv->use_mirror,errstr);
		if(size>0){
			UdmLog(Indexer->handle,UDM_LOG_DEBUG,"%s has been taken from mirror",Doc->url);
			found_in_mirror=1;
		}else{
			UdmLog(Indexer->handle,UDM_LOG_DEBUG,"%s",errstr);
		}
	}

	/* Get it from the source  */
	/* if no mirror copy found */

	if(!found_in_mirror){
#ifdef USE_HTDB
		if(!strcmp(realURL.schema,"htdb")){
			size = UdmHTDBGet(Indexer,realURL.path,realURL.filename,
				CurSrv->htdb_list,CurSrv->htdb_doc,
				Indexer->buf,UdmMaxDocSize());
			if(UdmDBErrorCode(Indexer->db))result=IND_ERROR;
		}
#endif
#ifdef USE_FILE
		if(!strcmp(realURL.schema,"file")){
			size = UdmFILEGet(request,Indexer->buf,UdmMaxDocSize());
		}
#endif
#ifdef USE_NEWS
		if((!strcmp(realURL.schema,"news"))){
			size = UdmNEWSGet(realURL.hostname,
				realURL.port?realURL.port:119,
				request,Indexer->buf,
				UdmMaxDocSize(),
				CurSrv->read_timeout);
		}else
		if((!strcmp(realURL.schema,"nntp"))){
			size = UdmNNTPGet(realURL.hostname,
				realURL.port?realURL.port:119,
				request,Indexer->buf,
				UdmMaxDocSize(),
				CurSrv->read_timeout);
		}
#endif
#ifdef USE_HTTP
		if((!strcmp(realURL.schema,"http"))){
			size = UdmHTTPGet(
				CurSrv->proxy?CurSrv->proxy:realURL.hostname,
				CurSrv->proxy?CurSrv->proxy_port:(realURL.port?realURL.port:80),
				request,Indexer,
				UdmMaxDocSize(),
				CurSrv->read_timeout);
		}else
		if((!strcmp(realURL.schema,"ftp"))&&(CurSrv->proxy)){
			size = UdmHTTPGet(
				CurSrv->proxy?CurSrv->proxy:realURL.hostname,
				CurSrv->proxy?CurSrv->proxy_port:(realURL.port?realURL.port:80),
				request,Indexer, 
				UdmMaxDocSize(),
				CurSrv->read_timeout);
		}
#endif
#ifdef USE_FTP
		if ((!strcmp(curURL.schema,"ftp"))&&(!CurSrv->proxy)){
		    size = UdmFTPGet(realURL.hostname,realURL.port?realURL.port:21,
			curURL.path, curURL.filename[0]=='\0'?NULL:curURL.filename, CurSrv->user,
			CurSrv->passwd, Indexer, UdmMaxDocSize(),
			CurSrv->read_timeout, Doc->last_mod_time, (Method==UDM_HEAD?1:0));
		}
#endif
	}

	if(!result) /* Extra check for HTDB possible bad result */
	switch(size){
	case UDM_NET_UNKNOWN:
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Protocol not supported");
		status=UDM_HTTP_STATUS_NOT_SUPPORTED;
		result=IND_OK;
		break;
	case UDM_NET_TIMEOUT:
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Download timeout");
		status=UDM_HTTP_STATUS_TIMEOUT;
		result=IND_OK;
		break;
	case UDM_NET_CANT_CONNECT:
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Can't connect to host %s:%d",
		CurSrv->proxy?CurSrv->proxy:curURL.hostname,
		CurSrv->proxy?CurSrv->proxy_port:(curURL.port?curURL.port:80));
		status=UDM_HTTP_STATUS_UNAVAIL;
		result=IND_OK;
		break;
	case UDM_NET_CANT_RESOLVE:
		UdmLog(Indexer->handle,UDM_LOG_WARN,"Unknown host %s",
			CurSrv->proxy?CurSrv->proxy:curURL.hostname);
		status=UDM_HTTP_STATUS_UNAVAIL;
		result=IND_OK;
		break;
	default:
		if(size<0){	/* No connection */
			UdmLog(Indexer->handle,UDM_LOG_WARN,"Can't connect to host %s:%d",
				CurSrv->proxy?CurSrv->proxy:curURL.hostname,
				CurSrv->proxy?CurSrv->proxy_port:(curURL.port?curURL.port:80));
			status=UDM_HTTP_STATUS_UNAVAIL;
			result=IND_OK;
		}else{
			/* Document has been successfully loaded */
			/* Cut HTTP response header first        */
			if((content=strstr(Indexer->buf,"\r\n\r\n"))){
				*content = 0;
				content +=4;
			} else if ((content=strstr(Indexer->buf,"\n\n"))){
				*content = 0;
				content +=2;
			}
			if(!content){
				UdmLog(Indexer->handle,UDM_LOG_ERROR,"Illegal HTTP headers in response");
				status=UDM_HTTP_STATUS_UNAVAIL;
				result=IND_OK;
			}
		}
	}

	/* Check again whether "terminate" was pressed */
	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;

	/* Exit if there was an error while downloading */
	if(result){
		CurSrv->net_errors++;
		if(result!=IND_ERROR)
			result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		FreeDoc(Doc);
		return(result);
	}

	/* Let's start parsing */
	Indexer->buf[size]=0;header=Indexer->buf;
	realsize=size;
	status=changed=1;origin=0;
	content_type=location=Indexer->charset=statusline=NULL;
	crc32=0;text[0]=0;title[0]=0;keywords[0]=0;
	descript[0]=0;subj[0]=0;last_mod_time=0;from[0]=0;
	last_mod_time=Doc->last_mod_time;
	

	index	=CurSrv->gindex;
	follow	=CurSrv->gfollow;

	size-=(content-Indexer->buf);     /* Could be modified by Content-Length    */
	realsize-=(content-Indexer->buf); /* Will be safe for crc32, Parsers etc   */

	UDM_THREADINFO(Indexer->handle,"Parsing",Doc->url);
	/* Now lets parse response header lines */
	hcopy = strdup(header);
	tok=UdmGetToken(header,"\r\n",&lt);
	while(tok){
		if(!UDM_STRNCASECMP(tok,"HTTP/")){
			status=atoi(tok+8);
			statusline=tok;
		}else
		if ((Force1251) && (!UDM_STRNCASECMP(tok,"Server: "))){
			char * sname;
			sname=UdmTrim(tok+7," ");
			if (!UDM_STRNCASECMP(sname,"Microsoft")||
				!UDM_STRNCASECMP(sname,"IIS"))
					Indexer->charset="windows-1251";
		}else
		if(!UDM_STRNCASECMP(tok,"Content-Type: ")){
			char *p;
			content_type=tok+14;
			if((p=strstr(content_type,"charset=")))
				Indexer->charset = p + 8;
			/* Store content_type right now  */
			/* It can be modified  after     */
			/* possible external parser call */
			escstr(content_type_escaped,content_type);
		}else
		if(!UDM_STRNCASECMP(tok,"Location: ")){
			location=tok+10;
		}else
		if(!UDM_STRNCASECMP(tok,"Content-Length: ")){
			size=atoi(tok+16);
		}else
		if(!UDM_STRNCASECMP(tok,"Subject: ")){
			UDM_STRNCPY(title,tok+9);
			strcpy(subj,title);
#ifdef NEWS_EXT
			strncpy(HeaderSubject,tok+9,UDM_MAXSUBJSIZE);
#endif
		}else
		if(!UDM_STRNCASECMP(tok,"From: ")){
			UDM_STRNCPY(from,tok+6);
#ifdef NEWS_EXT
			strncpy(HeaderFrom,tok+6,UDM_MAXFROMSIZE);
#endif
		}else
		if(!UDM_STRNCASECMP(tok,"Newsgroups: ")){
			UDM_STRNCPY(keywords,tok+12);
#ifdef NEWS_EXT
			strncpy(HeaderGroup,tok+12,UDM_MAXGROUPSIZE);
#endif
		}else
#ifdef NEWS_EXT
		if(!UDM_STRNCASECMP(tok,"Date: ")){
			strncpy(HeaderDate,tok+6,UDM_MAXDATESIZE);
		}else
		if(!UDM_STRNCASECMP(tok,"References: ")){
			strncpy(HeaderRefs,tok+12,UDM_MAXREFSIZE);
		}else
#endif
		if(!UDM_STRNCASECMP(tok,"Last-Modified: ")){
			last_mod_time=UdmHttpDate2Time_t(tok+15);
		}
		UdmLog(Indexer->handle,UDM_LOG_DEBUG,"%s",tok);
		tok=UdmGetToken(NULL,"\r\n",&lt);
	}
	UdmLog(Indexer->handle,UDM_LOG_EXTRA,"%s %s %d",statusline?statusline:"?",content_type?content_type:"?",size);


#ifdef NEWS_EXT
	/* do correct string termination */
	HeaderDate[UDM_MAXDATESIZE-1] = 0;
	HeaderFrom[UDM_MAXFROMSIZE-1] = 0;
	HeaderSubject[UDM_MAXSUBJSIZE-1] = 0;
	HeaderGroup[UDM_MAXGROUPSIZE-1] = 0;
	HeaderRefs[UDM_MAXREFSIZE-1] = 0;
	if((SQLDate=UdmParseDate(HeaderDate))){
		escstr(HeaderDateEsc,UdmParseDate(HeaderDate));
		free(SQLDate);
	}
	/* Escape Headers */
	escstr(HeaderFromEsc,HeaderFrom);
	escstr(HeaderSubjectEsc,HeaderSubject);
	escstr(HeaderGroupEsc,HeaderGroup);
	escstr(HeaderRefsEsc,HeaderRefs);
#endif

	switch(UdmHTTPResponseType(status)){
	case 1: /* No HTTP code */
		CurSrv->net_errors++;
		UdmLog(Indexer->handle,UDM_LOG_ERROR,"No HTTP response status");
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		break;

	case UDM_HTTP_STATUS_OK:
		if(!content_type){
			UdmLog(Indexer->handle,UDM_LOG_ERROR,"No Content-type in '%s'!",Doc->url);
			CurSrv->net_errors++;
			result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		}
		break;

	case UDM_HTTP_STATUS_REDIRECT: /* We'll try to use Location: xxx instead */

		if((Doc->hops<CurSrv->maxhops)&&(location)){
			int newMethod;
			newMethod=UdmFilterType(location,reason);
			if(newMethod!=UDM_DISALLOW){
				UDM_URL	newURL;
				int res;
				if(!(res=UdmParseURL(&newURL,location))){
					if(UdmFindServer(location)){
						if((!CurSrv->delete_no_server)&&(!CurSrv->outside)){
							/* compare hostinfo in some cases */
							if(!strcmp(curURL.hostinfo,newURL.hostinfo)){
								if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
								if(UdmAddHref(Indexer,location,Doc->url_id,Doc->hops+1,0))
									have_targets=1;
								if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
							}
						}else{
							if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
							if(UdmAddHref(Indexer,location,Doc->url_id,Doc->hops+1,0))
								have_targets=1;
							if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
						}
					}
				}else{
					switch(res){
					case UDM_PARSEURL_LONG:
						UdmLog(Indexer->handle,UDM_LOG_ERROR,"Redirect URL too long: '%s'",location);
						break;
					case UDM_PARSEURL_BAD:
					default:
						UdmLog(Indexer->handle,UDM_LOG_ERROR,"Error in redirect URL: '%s'",location);
					}
				}
			}
		}
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		break;

	case UDM_HTTP_STATUS_NOT_MODIFIED:  /* Not Modified, nothing to do */
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		break;

	case UDM_HTTP_STATUS_DELETE:
		/* delete it if not robots.txt */
		UdmLog(Indexer->handle,UDM_LOG_EXTRA,"Deleting URL");
		if(!strcmp(curURL.filename,"robots.txt")){
			result=UdmDeleteRobotsFromHost(Indexer,curURL.hostinfo);
			if(result==IND_OK)result=UdmLoadRobots(Indexer);
		}
		if(result!=IND_ERROR){
			if(CurSrv->deletebad){
				result=UdmDeleteUrl(Indexer,Doc->url_id);
			}else{
				result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
			}
		}
		break;

	case UDM_HTTP_STATUS_RETRY: /* We'll retry later, maybe host is down */
		CurSrv->net_errors++;
		UdmLog(Indexer->handle,UDM_LOG_EXTRA,"Could not read URL");
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
		break;

	default: /* Unknown status, retry later */
		UdmLog(Indexer->handle,UDM_LOG_WARN,"HTTP %d We don't yet know how to handle it, skipped",status);
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
	}
	
	/* Check again whether "terminate" pressed */
	if(Indexer->action==IND_TERMINATED)result=IND_TERMINATED;

	/* Return if Content parsing is not required */
	if(result){
		FreeDoc(Doc);
		return(result);
	}

	/* Now we have HTTP_OK and know Content-Type */
	/* Lets start parse the body                 */

	/* Put into mirror if required */
	if ((CurSrv->use_mirror>=0)&&(!found_in_mirror)){
		char errstr[UDMSTRSIZ]="";
		/* I tried to get it from mirror, but nothing was there  */
		/* or it was expired. Now we will store it on the mirror */
		if(UdmMirrorPUT(realURL.schema, realURL.hostname,
				realURL.path, realURL.filename,
				hcopy, content, size,errstr)){
			UdmLog(Indexer->handle,UDM_LOG_DEBUG,"%s",errstr);
		}
	}
	UDM_FREE(hcopy);


#ifdef USE_PARSER
	/* Let's try to start external parser for this Content-Type */
	if((mime=UdmExecParser(content_type,&mimeno,content,realsize,UdmMaxDocSize()-(content-Indexer->buf)))){
		content=mime;
		content_type=UdmParserParam(mimeno,UDM_MIME_TO);
		if((mime=UdmParserParam(mimeno,UDM_MIME_CHARSET)))
			Indexer->charset=mime;
	};
#endif

	/* robots.txt */
	if(!UDM_STRNCASECMP(content_type,"text/plain")&&
	(!strcmp(curURL.filename,"robots.txt"))){
		result=UdmParseRobots(Indexer,content,curURL.hostinfo);
		if(result!=IND_ERROR)result=UdmLoadRobots(Indexer);
		if(result==IND_ERROR){
			FreeDoc(Doc);
			return(IND_ERROR);
		}
	}else

	/* plain TEXT or the same */
	if(!UDM_STRNCASECMP(content_type,"text/plain")||
		!UDM_STRNCASECMP(content_type,"text/tab-separated-values")||
		!UDM_STRNCASECMP(content_type,"text/css"))
	{
		if(Indexer->charset)	{
			UdmRecode(content,UdmGetCharset(Indexer->charset),Indexer->local_charset);
		} else {
			int DCindex=0;
#ifdef USE_CHARSET_GUESSER
			DCindex = UdmGuessCharset(content, UdmGetCharset(CurSrv->charset));
#else
			DCindex = UdmGetCharset(CurSrv->charset);
#endif
			UdmRecode(content,DCindex,Indexer->local_charset);
		}
		if(Method!=UDM_HEAD){
		        /* MD5_DIGEST(content,realsize,digest); */
			/* changed=strcmp(digest,Doc->crc); */
			crc32=UdmCRC32(content, realsize);
			changed=!(crc32==Doc->crc32);
			if(CurSrv->use_clones){
				origin=UdmFindOrigin(Indexer, crc32, size);
				origin=((origin==Doc->url_id)?0:origin);
			}
		}
		if(((index)&&(!origin)&&(changed))||reindex){
			strncpy(text,content,UDM_MAXTEXTSIZE-2);
			text[UDM_MAXTEXTSIZE-1]=0;
			ParseText(Indexer,CurSrv,content,CurSrv->bodyweight,1);
		}
	}else

	/* HTML text */
	if(!UDM_STRNCASECMP(content_type,"text/html")){

		if (Indexer->charset) {
			UdmRecode(content,UdmGetCharset(Indexer->charset),Indexer->local_charset);
		} else {
			int DCindex=0;
#ifdef USE_CHARSET_GUESSER
			DCindex = UdmGuessCharset(content, UdmGetCharset(CurSrv->charset));
#else
			DCindex = UdmGetCharset(CurSrv->charset);
#endif
			UdmRecode(content,DCindex,Indexer->local_charset);
		}
		if(Method!=UDM_HEAD){
			/*
			MD5_DIGEST(content,realsize,digest);
			changed=strcmp(digest,Doc->crc);
			*/
			crc32=UdmCRC32(content, realsize);
			changed=!(crc32==Doc->crc32);
			if(CurSrv->use_clones){
				origin=UdmFindOrigin(Indexer, crc32, size);
				origin=((origin==Doc->url_id)?0:origin);
			}
		}
		
		if(((index||follow)&&(!origin)&&(changed))||reindex){
			ParseHtml(Indexer,CurSrv,content,&curURL,
				Doc,index,follow,text,keywords,descript,title);
		}
	}else{
		/* Unknown Content-Type */
		index=0; /* What this line for? */
		if(Method!=UDM_HEAD){
			/*
			MD5_DIGEST(content,realsize,digest);
			changed=strcmp(digest,Doc->crc);
			*/
			crc32=UdmCRC32(content, realsize);
			changed=!(crc32==Doc->crc32);

			if(CurSrv->use_clones){
				origin=UdmFindOrigin(Indexer, crc32, size);
				origin=((origin==Doc->url_id)?0:origin);
			}
		}
	}

	if (strcmp(curURL.filename,"robots.txt")) {
		char str[UDMSTRSIZ];
		if(CurSrv->urlweight) {
			strcpy(str,Doc->url);
			ParseText(Indexer,CurSrv,str,CurSrv->urlweight,1);
		} else {
			if(CurSrv->urlhostweight) {
				strcpy(str,curURL.hostname);
				ParseText(Indexer,CurSrv,str,CurSrv->urlhostweight,1);	
			}
			if(CurSrv->urlpathweight) {
				strcpy(str,curURL.path);
				ParseText(Indexer,CurSrv,str,CurSrv->urlpathweight,1);
			}
			if(CurSrv->urlfileweight) {
				strcpy(str,curURL.filename);
				ParseText(Indexer,CurSrv,str,CurSrv->urlfileweight,1);
			}
		}   
	}

	UDM_THREADINFO(Indexer->handle,"Updating",Doc->url);
	if(origin){
		UdmLog(Indexer->handle,UDM_LOG_EXTRA,"Duplicate Document with #%d",origin);
		result=UdmDeleteWordFromURL(Indexer,Doc->url_id);
		if(result==IND_OK)
			result=UdmUpdateClone(Indexer,Doc->url_id,status,CurSrv->period,
				content_type_escaped,last_mod_time,crc32);
	}else
	if((!changed)&&(!reindex)){
		result=UdmUpdateUrl(Indexer,Doc->url_id,status,CurSrv->period);
	}else{
		if(subj[0]){
			char str[UDMSTRSIZ];
			strcpy(str,subj);
			ParseText(Indexer,CurSrv,str,CurSrv->titleweight,1);
		}
		if((result=UdmStoreWords(Indexer,Doc->url_id, Doc->status))==IND_OK){
			if(LockProc)LockProc(UDM_LOCK,UDM_LOCK_TARGET);
			if(UdmStoreHrefs(Indexer))
				have_targets=1;
			if(LockProc)LockProc(UDM_UNLOCK,UDM_LOCK_TARGET);
			if(UdmDBErrorCode(Indexer->db))
				result=IND_ERROR;
			else
				result=IND_OK;
		}
		if(result!=IND_ERROR){
			int total,i;
			char str[UDMSTRSIZ]="";
			char lang[3]="";
			char *s;

			/* Prepare text,keywords,description */
			if(!strcmp(realURL.schema,"news"))strcpy(keywords,from);

			s=text;
#ifndef NEWS_EXT
			while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
#endif
			text[UDM_MAXTEXTSIZE-1]=0;
			escstr(text_escaped,text);

			s=keywords;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
			keywords[UDM_MAXKEYWORDSIZE-1]=0;
			escstr(keywords_escaped,keywords);

			s=descript;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
			descript[UDM_MAXDESCSIZE-1]=0;
			escstr(descript_escaped,descript);

			s=title;while(*s){if(strchr("\t\n\r",*s))*s=' ';s++;}
			title[UDM_MAXTITLESIZE-1]=0;
			escstr(title_escaped,title);
			
			

			/* Guess the language */
			qsort((void*)(Indexer->lang+1),UDM_LANGPERDOC-1,sizeof(UDM_LANG),cmplang);
			total=0;
			for(i=1;i<UDM_LANGPERDOC;i++)
				total+=Indexer->lang[i].count;
			if(total){
				sprintf(str,"total: %d",total);
				for(i=1;i<UDM_LANGPERDOC;i++){
					if(Indexer->lang[i].count)
						sprintf(UDM_STREND(str)," %s:%d (%d%%)",
						Indexer->lang[i].lang,
						Indexer->lang[i].count,
						(int)(Indexer->lang[i].count*100/total));
				}
				UdmLog(Indexer->handle,UDM_LOG_DEBUG,"language: %s",str);
				if(Indexer->lang[1].count>10){
					strncpy(lang,Indexer->lang[1].lang,2);
					lang[2]=0;
				}
			}
#ifdef NEWS_EXT
			/*
			extract message id from url
			thanks guys, the message id is in realURL.filename !!
			valid message ids have an @ character
			*/

			if(strchr(realURL.filename,'@'))
				escstr(MessageIdEsc,realURL.filename);
						
			/* get rec_id from my parent out of db (if I have one...) */

			if(strlen(HeaderRefs))
			{
				/* HeaderRefs contains all message ids of my predecessors, space separated*/
				/* my direct parent is the last in the list*/
				if((parent = strrchr(HeaderRefs,' ')))
				{	
					/* parent now points to the space character */
					/* skip it */
					++parent;
				}
				else
				{
					/* there is only one entry in HeaderRefs, so this is my parent */
					parent=HeaderRefs;	
				}	
			}
			result = 0;
			/* if the parent is really a message id, */
			/* get its rec_id from database          */
			if(parent && strlen(parent) && strchr(parent,'@'))
				result = UdmFindMsgID(Indexer,parent);	
			/*
			now register me with my parent
			result is -1 if no parent was found
			*/
#ifdef HEIKODEBUG
			fprintf(stderr,"%s, me: %d, parent: %d\n",parent,Doc->url_id,result);
#endif
			if(result > 0)
				result = UdmRegisterChild(Indexer,result,Doc->url_id);

			result=UdmLongUpdateUrl(Indexer,Doc->url_id,status,
				changed,size,CurSrv->period,
				CurSrv->tag?CurSrv->tag:"",
				index,last_mod_time,
				text_escaped,title_escaped,
				content_type_escaped,keywords_escaped,
				descript_escaped,crc32,lang,
				CurSrv->category?CurSrv->category:"",
				HeaderDateEsc,
				HeaderSubjectEsc,
				HeaderFromEsc,
				HeaderGroupEsc,
				HeaderRefsEsc,
				MessageIdEsc);
#else
			result=UdmLongUpdateUrl(Indexer,Doc->url_id,status,changed,size,CurSrv->period,
				CurSrv->tag?CurSrv->tag:"",
				index,last_mod_time,
				text_escaped,title_escaped,
				content_type_escaped,keywords_escaped,
				descript_escaped,crc32,lang,
				CurSrv->category?CurSrv->category:"");
#endif			
		}
	}
	FreeWords(Indexer);
	FreeDoc(Doc);
	return(result);
}


/************ Misc functions *****************************/


/*
 * Allocate and initialise a new indexer object.
 *
 * handle - caller supplied identifier stored in the object; it is used
 *          in the multi-threaded version (passed to UdmLog() and to
 *          the ThreadInfo callback).
 *
 * Returns the new indexer, or NULL when memory for the indexer itself
 * or for its reserved connection structures could not be obtained.
 * Release the object with UdmFreeIndexer().
 */
__INDLIB__ UDM_INDEXER * UdmAllocIndexer(int handle){
UDM_INDEXER * result;

	result=(UDM_INDEXER*)malloc(sizeof(UDM_INDEXER));
	/* malloc() may fail: never hand a NULL pointer to memset() */
	if(!result)
		return(NULL);
	memset(result, 0, sizeof(UDM_INDEXER));
	result->mwords=0;
	result->nwords=0;
	result->swords=0;
	result->handle=handle; /* Handle is used in multi-threaded version */
	result->action=0;
	result->state[0]=0;
	result->Word=NULL;
	result->buf=NULL;
	result->charset=NULL;
	result->connp=NULL;
	result->local_charset=UdmGetDefaultCharset();
	result->db=UdmAllocDB(UDM_OPEN_MODE_WRITE);

	/* Reserve connection structure */
	result->connp = (UDM_CONN*)UdmXmalloc(sizeof(UDM_CONN));
	/* NOTE(review): UdmXmalloc() may already abort on failure; */
	/* the checks below are harmless in that case - confirm.    */
	if(!result->connp)
		goto fail;
	result->connp->indexer = result;

	/* Second, nested connection structure pointing back at us too */
	result->connp->connp = (UDM_CONN*)UdmXmalloc(sizeof(UDM_CONN));
	if(!result->connp->connp)
		goto fail;
	result->connp->connp->indexer = result;
	return(result);

fail:
	/* Undo the partial construction so nothing is leaked.       */
	/* Assumes UdmXmalloc() memory is free()-compatible - TODO   */
	/* confirm against udm_xmalloc.                              */
	if(result->connp){
		UDM_FREE(result->connp);
	}
	if(result->db)
		UdmFreeDB(result->db);
	free(result);
	return(NULL);
}
/*
 * Destroy an indexer created by UdmAllocIndexer().
 *
 * Releases the collected words, the word array, the document buffer,
 * the two connection structures reserved by UdmAllocIndexer(), the
 * database handle and finally the indexer itself.
 * Passing NULL is allowed and does nothing.
 */
__INDLIB__ void UdmFreeIndexer(UDM_INDEXER *Indexer){

	if(Indexer){
		FreeWords(Indexer);
		UDM_FREE(Indexer->Word);
		UDM_FREE(Indexer->buf);
		/* UdmAllocIndexer() reserves connp and connp->connp;   */
		/* they were never released here, leaking both structs. */
		/* Inner one first, it is only reachable through connp. */
		/* NOTE(review): only the structures themselves are     */
		/* freed; anything they own must be released by the     */
		/* connection code - confirm.                           */
		if(Indexer->connp){
			UDM_FREE(Indexer->connp->connp);
			UDM_FREE(Indexer->connp);
		}
		UdmFreeDB(Indexer->db);
		free(Indexer);
	}
}
/*
 * Public entry point for clearing the database: forwards the
 * indexer to UdmClearDB() and returns its result unchanged.
 */
__INDLIB__ int UdmClearDatabase(UDM_INDEXER * Indexer){
	int rc = UdmClearDB(Indexer);

	return rc;
}

/*
 * Return the error message reported by UdmDBErrorMsg() for the
 * database handle attached to this indexer.
 */
__INDLIB__ char * UdmIndexerErrorMsg(UDM_INDEXER *Indexer){
	char *message = UdmDBErrorMsg(Indexer->db);

	return message;
}

/*
 * Store a new action code in the indexer.
 * The indexing code compares this field with IND_TERMINATED to
 * find out whether it has been asked to stop.
 */
__INDLIB__ void UdmSetAction(UDM_INDEXER *Indexer, int action)
{
	Indexer->action = action;
}
