############################################################################
# Autor: 					Jan-Ole Esleben
# e-Mail-Adresse: esleben@cl.uni-heidelberg.de
# Projekt:				Terminparser (beendet WS02)
# 
# Textmodul; parst den Text mit Hilfe des Grammatikmoduls und stellt über 
# eine Schnittstelle die Funde zur Verfügung.
#
# Dokumentation kann mit rdoc extrahiert werden; Kommentare und
# Dokumentation sind aus Gründen der Kürze und Prägnanz in englischer
# Sprache.
############################################################################
# License: cf. file LICENSE.txt distributed with the code
############################################################################

# TODO: MULTILINE matching should be used in order to avoid one
# instance of @text!!!

module Calendar
	TOKEN = "token"
	LINEBREAK = "linebreak"
	TOKSTART = "<token"
	TEXT = "text" # toplevel

	# The Text class is used to access a text (sequentially) and apply rules
	# from a Calendar::Grammar instance to it.
	#
	# Typical use: create an instance, call #process once, then query the
	# recorded matches with #extract.
	class Text

		# Pattern that locates the next candidate position in the downcased
		# text. Built once here instead of being re-interpolated on every
		# call of the scanner.
		TOKEN_OR_LINEBREAK = /#{TOKEN}|#{LINEBREAK}/

		# * +text+ is a marked up text (containing only "token" tags and the
		#   toplevel "text" tag) - one long string! Newlines are replaced by
		#   blanks.
		# * +grammar+ is an instance of Calendar::Grammar, which must already be
		#   in a usable state (i. e.: it must already have a valid ruleset to
		#   work with). It has to respond to +rules+ and +each+; +each+ must
		#   yield rules that answer to <tt>['regex']</tt>, <tt>['type']</tt>
		#   and <tt>['score']</tt>.
		# * <tt>*markup</tt> is a dict of markup specifiers (UNUSED)
		#
		# Raises a RuntimeError if the grammar has no rules.
		def initialize(text, grammar, *markup)
			# TODO: use markup
			@text = text.tr("\n", " ")
			@dtext = @text.downcase # in order to correctly identify tokens
			@grammar = grammar
			@position = -2 # so that the first search starts at index 0
			@reftable = nil # filled lazily by #insert on the first match
			raise "Grammar not ready!" if grammar.rules.empty?
		end

		# Main method: initiates (usually lengthy) parsing operation.
		#
		# Walks over every candidate position and offers the rest of the text,
		# starting at that position, to each rule of the grammar. Calling it
		# again after the text has been exhausted does nothing.
		#
		# A progress line per position is written to stderr when Ruby runs in
		# debug mode (<tt>$DEBUG</tt>).
		def process
			while nextToken # loop over tokens
				# slice the remaining text once per position, not once per rule
				rest = @text[@position..-1]
				@grammar.each do |rule| # this /might/ be stupid...
					feedMe rule, rest
				end
				warn "Token @ #{@position}" if $DEBUG
			end
		end

		# Extract one +type+ of match from the result.
		#
		# Every match is an Array containing the score, the matched text and
		# the start and (exclusive) end index of the match.
		#
		# * Without +score+ the single highest-ranking match is returned.
		# * With +score+ and a block, the block is passed every match whose
		#   score is >= the supplied score, and the complete list of matches
		#   for +type+ is returned.
		# * With +score+ but without a block, an Array of the matches whose
		#   score is >= the supplied score is returned.
		#
		# Returns +nil+ if nothing was recorded for +type+. Raises a
		# RuntimeError if no match at all has been recorded yet (either
		# #process has not been run or no rule matched).
		#
		# Example: <tt>extract "vort"</tt> -->
		# <tt>[80, "<token>Heidelberg</token>", 20, 45]</tt>
		def extract(type, score = nil)
			raise "No matches recorded (run #process first)" if @reftable.nil?
			entries = @reftable[type]
			return if entries.nil? # silently fail...
			return entries.max if score.nil?
			unless block_given?
				return entries.select { |entry| entry[0] >= score }
			end
			# The block variable must not be called "score": that would shadow
			# the threshold and turn the comparison into a tautology.
			entries.each do |match_score, text, s, e|
				yield [match_score, text, s, e] if match_score >= score
			end
		end

		#######
		private
		#######

		# Note: I use Java naming conventions for private methods (methodName
		# instead of method_name)

		# TODO: the text could actually be "compiled" with all the regexps...
		# keep in mind for version .2 ;)

		# Advance @position to the next candidate position and return it, or
		# return +false+ once the text is exhausted (@position is then left
		# untouched, so further calls keep returning +false+).
		#
		# The keyword is searched in the downcased copy of the text; the
		# position is then moved one character back, onto the character in
		# front of the keyword (the "<" of an opening tag).
		#
		# NOTE: nothing here suppresses rules inside the span of an earlier
		# match; every match of every rule is recorded.
		def nextToken
			found = @dtext.index(TOKEN_OR_LINEBREAK, @position + 2)
			return false unless found
			# never step in front of the text: a negative index would address
			# the text from its end
			@position = [found - 1, 0].max
		end

		# Apply one +rule+ to +rest+ (the text from the current position
		# onwards) and record the match if it starts exactly at the current
		# position. The span that gets recorded is capture group 2 of the
		# rule's regex.
		def feedMe(rule, rest = @text[@position..-1])
			m = rule['regex'].match(rest)
			return unless m && m.begin(0) == 0
			insert rule['type'], rule['score'], m.begin(2), m[2].size
		end

		# Record a match of +type+ with +score+. +start+ is relative to the
		# current position, +len+ is the length of the matched span.
		# (preparatory for inserting tags: constructs the position table)
		def insert(type, score, start, len)
			# @reftable is a clean and complete list of operations performed
			@reftable ||= {}
			s = @position + start
			e = s + len
			(@reftable[type] ||= []).push [score, @text[s...e], s, e]
		end
	end
end
