#!/usr/bin/newlisp
;; Creative Commons Attribution (by) License v2.5
;; Full text - http://creativecommons.org/licenses/by/2.5/
;; Contact - gordon.fischer@gmail.com, desanto@mac.com
;; Copyright (c) 2006, kozoru, Inc.
;; Updated to newLISP v.10.0, L.M. May 2009
;;
;; SentenceBoundary.lsp
;; $Rev: 2701 $
;;
;; - Uses contexts CLEAN and BOUNDARY
;; - Supplies the following function in MAIN
;;     (url-to-sentences "http://en.wikipedia.org/wiki/Grady_(band)" 15000)
;;     (url-to-sentences "http://www.bbc.co.uk/drama/spooks/series4_ep10.shtml" 10000)
;; - (CLEAN:clean-html str token)
;;     If token is not nil, token will be inserted at every guaranteed sentence
;;     break (ie. after known block-level tags -- the tag examples that
;;     originally appeared here were lost, see the note below)
;; - (BOUNDARY:GetSentences str)
;;     Breaks str on sentence boundaries. Can be used with CLEAN:clean-html to
;;     leverage HTML markup to ensure sentence breaks on known HTML characters.
;;     (eg. (CLEAN:clean-html str BOUNDARY:g_break_token) )
;;
;; NOTE(review): this copy of the file appears to have been damaged in
;; transit. Every HTML-tag string literal (the entries of the tag lists
;; below) has had its "<tag>" text stripped out, leaving empty ""/{}
;; literals; HTML entities inside literals were decoded in place (e.g.
;; what is now ("&" "&") was presumably ("&amp;" "&")); and a stray "}
;; in g_start_tags leaves an unterminated string, so this file cannot
;; load as-is. The damaged literals are preserved byte-for-byte below
;; and must be recovered from a pristine checkout - TODO confirm.

(context 'CLEAN)

;;
;; Global Configuration
;;

;; The string that we use to insert upon a known sentence boundary mark (SBM).
;; We include a space after the marker to ensure proper word separation.
(set 'g_sbm "SENTBRK ")

;; Tags that start a new block of text; clean-html replaces each with g_sbm.
;; NOTE(review): tag text lost (see header note); the stray "} also starts
;; an unterminated string here. Preserved as received.
(set 'g_start_tags (list "" "" "" "" "} {} {} {} {} {} {} {} {} {} {} {} )) ;; will use newlisp replace

;; Additional tags replaced with g_sbm via plain (non-regex) replace.
;; NOTE(review): tag text lost; several entries still contain the newline(s)
;; that followed the original tag, so the embedded blank lines below are
;; significant string content.
(set 'g_other_tags (list {
} {
} {
} {
} {

} {

} {} {} {} {} {} {} {} {} {} {} {} {} {
} {
} )) ;; will use newlisp replace

;; Tags replaced with g_sbm via regex replace (PCRE).
;; NOTE(review): tag text lost.
(set 'g_break_tags (list "" )) ;; will use PCRE replace

;; Tags that are kept but padded with a leading space (see clean-html).
;; NOTE(review): tag text lost.
(set 'g_table_tags (list {} {} {} {} )) ;; will use newlisp replace

;; Character/entity normalization table applied by asciify, in order.
;; Each entry is (from to). NOTE(review): entity names in the "from"
;; column were decoded during extraction, turning several entries into
;; no-ops (e.g. ("&" "&"), (""" {"})) - verify against a pristine copy.
(set 'g_ascii_translate '(
    ;; these are removed
    ({"} "") ("[edit]" "") (" ." "")
    ;; these are converted to spaces
    ("_" " ") (" " " ") (" " " ")
    ;; these are special characters
    (". ." ".") (".." ".") (":.." ":") ("’" {'}) ("\146" "'") ("“" {"})
    ("”" {"}) ("—" {-}) ("\151" "-") (" " " ") ("&" "&") (""" {"}) ("—" {-})
    ;; these are some UTF-8 characters
    ("\226\128\148" " - ") ("\226\128\147" " - ")
    ("\194\163" "£")     ; english pound character
    ("\195\169" "é")     ; accented e
    ("\226\128\162" "•") ; bullet
    ("\226\128\153" "'") ; reverse tick
))

;; Apply every (from to) pair of g_ascii_translate to str, in table order,
;; and return the rewritten string. replace mutates str in place, so the
;; trailing str returns the fully-translated text.
(define (asciify str)
    (dolist (a g_ascii_translate)
        (replace (a 0) str (a 1))
    )
    str
)

;; clean-html
;; In:  data        - raw HTML page text
;;      break_token - string inserted at every guaranteed sentence break;
;;                    nil is coerced to "" (i.e. breaks are just removed)
;; Out: data with markup stripped, known block boundaries replaced by
;;      break_token, characters normalized via asciify, and whitespace
;;      collapsed to single spaces.
;; Regex option values: per the original comments, 513 = PCRE_UNGREEDY(512)
;; + PCRE_CASELESS(1); 512 alone = PCRE_UNGREEDY.
;; NOTE(review): the empty {} / "" patterns below once held tags such as
;; <head>, <script>, <!-- --> (per the surrounding comments) and were
;; emptied by the damage described at the top of the file.
;; NOTE(review): g_heading_tags is referenced below but never defined in
;; this copy - presumably its (set ...) was lost too; confirm.
(define (clean-html data break_token, endHead pos)
    (unless break_token (set 'break_token ""))
    ; remove nils
    (replace "\\000" data "" 0)
    ;(replace "\n" data " ")
    ;; strip header
    (replace {.*} data "" 512)
    ;; strip text outside marked content
    (set 'endHead (find "" data))
    (if endHead (set 'data (slice data endHead)))
    ;; strip footer
    (set 'pos (find {
} data 1))
    (if pos (set 'data (slice data 0 pos)))
    ;; strip javascript and html comments
    (replace {} data "" 513)
    (replace {} data "" 513)
    (replace {} data "" 513)
    ;; Sentence Boundary Helpers
    (dolist (t g_start_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy
    (dolist (t g_heading_tags) (replace t data g_sbm))
    (dolist (t g_break_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy
    (dolist (t g_other_tags) (replace t data g_sbm))
    ;; When we see these strings we believe there's a sentence break
    (replace {\n\n} data g_sbm)
    (replace {|} data g_sbm)
    (replace {::} data g_sbm)
    ;; keep table-ish tags but ensure they are separated by a space
    (dolist (t g_table_tags) (replace t data (append " " t)))
    ;; drop any remaining <...> tags and {...} runs
    (replace {<[^>]*>} data "" 0)
    (replace "{[^}]*}" data "" 0)
    ;; convert the internal marker into the caller's break token
    (replace (trim g_sbm) data break_token)
    (set 'data (asciify data))
    ;; This is a citation stripper - primarily for use with wikipedia
    (replace {\.\s*\[\d+\]} data "." 512)
    ; clean white space
    (replace "\\s+" data " " 0)
)

(context 'MAIN)

(context 'BOUNDARY)

;; Sentences whose length falls outside (MIN, MAX] are discarded
;; by GetSentences.
(set 'MIN_SENTENCE_LENGTH 9)
(set 'MAX_SENTENCE_LENGTH 512)

;; This list contains abbreviations that are longer than 2 characters.
;; NOTE(review): no such abbreviation list survives in this copy -
;; presumably lost with the other damaged literals; only the punctuation
;; character class below remains.
(set 'g_punct_regex "[\\(\\[\\]\\)\\.]")

;; Token marking a guaranteed sentence break (inserted by CLEAN:clean-html).
;; NOTE(review): empty in this copy; GetSentences calls
;; (parse x g_break_token), and an empty separator splits into single
;; characters - the original token text was almost certainly lost in
;; extraction; confirm before use.
(set 'g_break_token "")

;; GetSentences
;; In:  str - cleaned text, ideally the output of CLEAN:clean-html
;; Out: list of sentence strings within the configured length bounds
;; Locals: c is declared but never used; tmp (set below) is NOT declared
;; local and therefore leaks into the BOUNDARY context -
;; NOTE(review): likely an oversight in the original.
(define (GetSentences str , sentence_list word_list last_word i c final)
    ;; strip all double-quotes
    (replace {"} str "")
    (replace "\n" str " ")
    ;; We always break after an exclamation followed by a space.
    (replace "! " str (append "!" g_break_token))
    ;; break upon ". "
    (set 'sentence_list (parse str ". "))
    ;; break those pieces upon space; (filter if ...) presumably drops
    ;; empty word lists (() is false in newLISP) - verify
    (set 'word_list (filter if (map (fn(x) (parse x " ")) sentence_list)))
    (set 'i 0)
    (if (> (length word_list) 1)
        (while (< i (length word_list))
            ;; Take the last word in a sentence and remove any of the characters in g_punct_regex
            ;; NOTE(review): last_word is computed but never used afterwards.
            (set 'last_word (replace g_punct_regex (last (word_list i)) "" 1))
            (if
                ;; If the current sentence contains only one word which is not whitespace
                ;; we join it onto the prior sentence and replace the trimmed "."
                (and (= (length (word_list i)) 1) (not (find " " (word_list i))) )
                (begin
                    ; (println "Current : " (string (word_list i)))
                    (set 'tmp (pop word_list i))
                    ; (println "Joining : " (string (word_list (- i 1))) " with " (string tmp))
                    (setf (word_list (- i 1) -1) (append (word_list (- i 1) -1) "."))
                    (setf (word_list (- i 1)) (append (word_list (- i 1)) tmp))
                )
                ;; Otherwise this is a valid sentence break and we move onto the next sentence.
                (inc i)
            )
        )
    ) ;; end if
    ;; Re-assemble each word list into "words ... ." + break token.
    (set 'final '())
    (dolist (w word_list)
        (push (append (join w " ") "." g_break_token) final -1)
    )
    ;; Split again on the break token, trim each piece, and collapse any
    ;; run of trailing dots to a single ".".
    (set 'final (map (fn (z) (replace "\\.+\\z" z "." 0))
        (map trim (flat (map (fn(x) (parse x g_break_token)) final)))))
    ;; Keep only sentences within the configured length bounds.
    (filter (fn(x) (and (> (length x) MIN_SENTENCE_LENGTH)
                        (<= (length x) MAX_SENTENCE_LENGTH))) final)
)

(context 'MAIN)

;; Fetch url (timeout per get-url), strip markup via CLEAN:clean-html
;; (inserting BOUNDARY's break token at guaranteed breaks), then split
;; the result into sentences.
(define (url-to-sentences url timeout)
    (BOUNDARY:GetSentences
        (CLEAN:clean-html (get-url url timeout) BOUNDARY:g_break_token))
)

;; eof