# Rule (grammar) side: names of the rule tags; each one is a key of RULES.
POS = 'pos' # case insensitive!
SEM = 'sem'
LIT = 'lit'
WORDMAX = 'wordmax'
LINEBREAK = 'linebreak'
ANYTHING = 'anything'
ANYTOKENS = 'anytokens'

# Rule attributes.
WEAKLINEBREAK = 'weaklinebreak' # stingy (lazy) linebreak after token
OPTIONAL = 'opt' # makes POS, SEM, LIT rules optional

# Pseudo rules that only emit grouping / alternation syntax.
BEGIN_ = 'begin'
END_ = 'end'
OR_ = 'or'

# Constants for the parser (value) side - SEM and POS from Tagger!
# Everything below is regexp *source text* (a String), not a compiled Regexp.
MAXANYTOKENS = 99 # upper bound ANYTOKENS uses when no 'max' attribute is given
TOKEN = 'token'

# Opening "<token" with optional leading whitespace; poss. space /after/ name.
TOKOPENSTART = ['(?>\s*<)', TOKEN, '\s*'].join

# End of an attribute value: optional quote, then optional whitespace.
# Dumbed down! POS and SEM deliberately share one and the same string object.
TOKSEMEND = '["\']?\s*'
TOKPOSEND = TOKSEMEND

# pos attribute: name, '=' and an optional opening quote (dumbed down!).
TOKPOSSTART = [POS, '=["\']?'].join
TOKANYPOS = [TOKPOSSTART, '(?>[\w]*', TOKPOSEND, ')'].join # TODO - ???

# sem attribute: same shape as the pos attribute (dumbed down!).
TOKSEMSTART = [SEM, '=["\']?'].join
TOKANYSEM = [TOKSEMSTART, '(?>[\w]*', TOKSEMEND, ')'].join # TODO - ???

# End of the opening tag. Dumbed down! No space after opening tag allowed!
TOKOPENEND = '>'
# A complete opening tag carrying arbitrary attributes.
TOKOPENANY = [TOKOPENSTART, '(?>[^>]*', TOKOPENEND, ')'].join
# Closing tag. TODO: dumbed down dangerously! (leading \s)
TOKCLOSE = ['<\/', TOKEN, '>'].join

# Token content: anything but angle brackets, then optional whitespace.
WORD = '[^<>]+\s*' # dumbed down - TODO: leading \s?
WORDMIN = '0' # minimum number of words for WORDMAX

# A line break element, and the same element made optional.
LB = '(?>\s*<)LINEBREAK/>'
LBPOSS = ['(?:', LB, '){0,1}'].join # possible line break

# Makes a rule optional on request.
#
# rule       - regexp source text (String) of an already built rule.
# attributes - attribute hash of the rule tag.
#
# Returns +rule+ wrapped in a non-capturing group quantified with {0,1}
# when the OPTIONAL attribute is set (truthy); otherwise returns +rule+
# itself, unchanged.
def optify(rule, attributes)
	optional = attributes[OPTIONAL]
	optional ? '(?:' + rule + '){0,1}' : rule
end

# Maps a rule tag name to a proc that builds the regexp source text for it.
#
# Every proc takes the rule's text content and its attribute hash and
# returns a String holding regexp source (not a compiled Regexp).
# The text is inserted into the pattern unescaped, so it may itself contain
# regexp syntax.
RULES = {
	# One token whose pos attribute matches +text+.
	POS => proc do |text, attributes|
		# TODO: sem attribute?
		optify(TOKOPENSTART + '(?>[^>]*' + TOKPOSSTART + ')' + text.strip +
			TOKPOSEND + '(?>[^>]*' + TOKOPENEND + ')' + WORD + TOKCLOSE +
			LBPOSS + (attributes[WEAKLINEBREAK] ? '?' : ''), attributes)
	end,

	# One token whose sem attribute matches +text+.
	SEM => proc do |text, attributes|
		# TODO: pos attribute?
		optify(TOKOPENSTART + '(?>[^>]*' + TOKSEMSTART + ')' + text.strip +
			TOKSEMEND + '(?>[^>]*' + TOKOPENEND + ')' + WORD + TOKCLOSE + LBPOSS +
			(attributes[WEAKLINEBREAK] ? '?' : ''), attributes)
	end,

	# LIT tags TODO: figure out if this use suffices!!!
	# TODO: also figure out if we should explicitly incorporate
	# regexp parts into lit tags (which this implementation makes possible!)
	# One token (any attributes) whose content is +text+.
	LIT => proc do |text, attributes|
		optify(TOKOPENANY + text.strip + TOKCLOSE + LBPOSS +
			(attributes[WEAKLINEBREAK] ? '?' : ''), attributes)
	end,

	# Between WORDMIN and +text+ arbitrary tokens (greedy).
	WORDMAX => proc do |text, attributes|
		# TODO: make this stingy? (colons, line feeds...)
		# Strip the bound like the other rules strip their text: whitespace
		# inside {min,max} would stop it from being read as a quantifier.
		# to_s keeps non-String input (e.g. an Integer) working as before.
		max_words = text.to_s.strip
		'(?:' + TOKOPENANY + WORD + TOKCLOSE +
			LBPOSS + (attributes[WEAKLINEBREAK] ? '?' : '') + "){#{WORDMIN},#{max_words}}"
	end,

	# Specials...

	# Linebreak
	LINEBREAK => proc do |_text, _attributes|
		LB
	end,

	# Any characters at all.
	ANYTHING => proc do |_text, _attributes|
		# WARNING! stingy! expensive!
		'.*?'
	end,

	# As few tokens as possible (lazy), at most the 'max' attribute or,
	# when that is absent, MAXANYTOKENS.
	ANYTOKENS => proc do |_text, attributes|
		'(?:' + TOKOPENSTART + '[^>]*' + TOKOPENEND + WORD + TOKCLOSE +
			"){#{WORDMIN},#{attributes['max'] || MAXANYTOKENS}}?"
	end,

	# TODO: These are all trick rules - next revision *must* kill them!
	# They emit bare grouping / alternation syntax, so the pattern is only
	# well-formed when the rules using them are balanced.
	BEGIN_ => proc do |_text, _attributes|
		'(?:'
	end,

	END_ => proc do |_text, _attributes|
		')'
	end,

	OR_ => proc do |_text, _attributes|
		'|'
	end,

	# Default rule (transport it over; maybe the default rule
	# should be taken care of by the algorithm and stored
	# separately (i.e. not in this hash))
}
