;; @module brill-tagger
;; @description Brill part of speech (POS) tagger interface
;; @version 0.3a
;; @author Lutz Mueller 2008
;;
;;
;; This module interfaces to a modified version of the Brill part of speech (POS) Tagger from:
;; @link http://www.cs.jhu.edu/~brill/RBT1_14.tar.Z http://www.cs.jhu.edu/~brill/RBT1_14.tar.Z <br>
;; The original source code was modified to also compile on Mac OS  X and to
;; suppress some information sent to standard out. This module uses
;; a simple newLISP 'exec' interface to call the tagger. The modified package is available here:
;; @link http://newlisp.org/code/BRILL_TAGGER_NEWLISP_V1.14.tgz BRILL_TAGGER_NEWLISP_V1.14 .
;;
;; <h2>Requirements</h2>
;; Make the Brill Tagger utilities using the modified Brill tagger distribution:
;; RULE_BASED_TAGGER_V1.14-MAC-OSX.tgz. To build on Mac OS X use 'makefile_osx';
;; to build on other UNIX systems use the normal 'Makefile'. See the file README.MAC_OSX
;; for details.
;;
;; By default, program and data files should be in the following places:
;; <pre>
;; /usr/bin/tagger
;; /usr/bin/start-state-tagger
;; /usr/bin/final-state-tagger
;; /usr/share/RULE_BASED_TAGGER_V1.14/LEXICON
;; /usr/share/RULE_BASED_TAGGER_V1.14/BIGRAMS
;; /usr/share/RULE_BASED_TAGGER_V1.14/LEXICALRULEFILE
;; /usr/share/RULE_BASED_TAGGER_V1.14/CONTEXTUALRULEFILE
;; </pre>
;;
;; When using different locations for the data files constants
;; of the same name in the header of 'brill-tagger.lsp' have to
;; be changed.



(context 'brt)

; Adjust the four paths below so they match your Brill Tagger installation.
(constant 'LEXICON "/usr/share/RULE_BASED_TAGGER_V1.14/LEXICON")
(constant 'BIGRAMS "/usr/share/RULE_BASED_TAGGER_V1.14/BIGRAMS")
(constant 'LEXICALRULEFILE "/usr/share/RULE_BASED_TAGGER_V1.14/LEXICALRULEFILE")
(constant 'CONTEXTUALRULEFILE "/usr/share/RULE_BASED_TAGGER_V1.14/CONTEXTUALRULEFILE")

; Warn (without aborting the load) about every data file that is missing.
; Each entry pairs the name printed in the warning with the configured path.
(dolist (entry (list (list "LEXICON" LEXICON)
					 (list "BIGRAMS" BIGRAMS)
					 (list "LEXICALRULEFILE" LEXICALRULEFILE)
					 (list "CONTEXTUALRULEFILE" CONTEXTUALRULEFILE)))
	(unless (file? (last entry))
		(println "cannot find " (first entry))))

; Warn about every tagger executable that is missing from /usr/bin.
(dolist (exe '("/usr/bin/tagger"
			   "/usr/bin/start-state-tagger"
			   "/usr/bin/final-state-tagger"))
	(unless (file? exe)
		(println "cannot find " exe)))


;; @syntax (brt:tag <str-corpus> [<boolean-flag>])
;; @param <str-corpus> The sentences to be tagged separated by a line feed character.
;; @param <boolean-flag> Set 'true' to see raw output from Brill Tagger.
;; @return An association list of words and their tags in the order they occur in the sentence.
;;
;; The corpus to be tagged should be one sentence per line, with punctuation tokenized.
;; As much text as possible should be tagged at once to minimize overhead.
;;
;; @example
;; (brt:tag "the cat eats the mouse")
;; =&gt; (("the" brt:DT) ("cat" brt:NN) ("eats" brt:VBZ) ("the" brt:DT) ("mouse" brt:NN))
;;
;; (brt:tag "the cat eats the mouse" true)
;; =&gt; "the/DT cat/NN eats/VBZ the/DT mouse/NN"
;;

; Tag a corpus with the external Brill 'tagger' program.
;
; phrase - string with the text to tag, one sentence per line
; flag   - when true, return the tagger's raw "word/TAG ..." text
;          (lines joined by "\n") instead of the association list
;
; Returns a list of (word tag-symbol) pairs in corpus order, or the raw
; output string when 'flag' is true. When the tagger produces no output
; the result is an empty list (or an empty string with 'flag').
;
; The corpus is written to a uniquely named scratch file under /tmp which
; is removed again after the tagger has run.
(define (brt:tag phrase flag , input command output)
	; 'input' is declared local above so the scratch file name does not
	; leak into the brt context between calls
	(set 'input (append "/tmp/" "rb" (uuid)))
	; the tagger utility seems to cut off the last character, so pad with a space
	(write-file input (string phrase " "))
	(set 'command (append "tagger " LEXICON " " input " " BIGRAMS " "
						LEXICALRULEFILE " " CONTEXTUALRULEFILE " 2> /dev/null"))
	(set 'output (exec command))
	; exec yields one string per output line; anything else means no usable output
	(set 'output (if (list? output) (join (map trim output) "\n") ""))
	(delete-file input)
	(if flag
		output
		(map (fn (wt)
				; split at the LAST slash so words which themselves contain
				; a slash (e.g. "1/2/CD") keep their full text
				(if (regex {^(.*)/([^/]*)$} wt)
					(list $1 (sym $2))
					; token without any slash: keep the original behaviour of
					; using the token as both word and tag
					(list wt (sym wt))))
			; split on any whitespace run (blanks AND the newlines between
			; sentences) and drop empty tokens
			(clean empty? (parse output {\s+} 0)))
	)
)


; eof
	



; syntax highlighting with newLISP and newLISPdoc