#!/usr/bin/newlisp
;; Creative Commons Attribution (by) License v2.5
;; Full text - http://creativecommons.org/licenses/by/2.5/
;; Contact - gordon.fischer@gmail.com, desanto@mac.com
;; Copyright (c) 2006, kozoru, Inc.
;; Updated to newLISP v.10.0, L.M. May 2009
;;
;; SentenceBoundary.lsp
;; $Rev: 2701 $
;;
;; - Uses contexts CLEAN and BOUNDARY
;; - Supplies the following function in MAIN
;;     (url-to-sentences "http://en.wikipedia.org/wiki/Grady_(band)" 15000)
;;     (url-to-sentences "http://www.bbc.co.uk/drama/spooks/series4_ep10.shtml" 10000)
;; - (CLEAN:clean-html str token)
;;     If token is not nil, token will be inserted at every guaranteed sentence
;;     break (ie. after known block-level tags -- the tag examples that
;;     originally appeared here were lost, see the note below)
;; - (BOUNDARY:GetSentences str)
;;     Breaks str on sentence boundaries. Can be used with CLEAN:clean-html to
;;     leverage HTML markup to ensure sentence breaks on known HTML characters.
;;     (eg. (CLEAN:clean-html str BOUNDARY:g_break_token) )
;;
;; NOTE(review): this copy of the file appears to have been damaged in
;; transit. Every HTML-tag string literal (the entries of the tag lists
;; below) has had its "<tag>" text stripped out, leaving empty ""/{}
;; literals; HTML entities inside literals were decoded in place (e.g.
;; what is now ("&" "&") was presumably ("&amp;" "&")); and a stray "}
;; in g_start_tags leaves an unterminated string, so this file cannot
;; load as-is. The damaged literals are preserved byte-for-byte below
;; and must be recovered from a pristine checkout - TODO confirm.

(context 'CLEAN)

;;
;; Global Configuration
;;

;; The string that we use to insert upon a known sentence boundary mark (SBM).
;; We include a space after the marker to ensure proper word separation.
(set 'g_sbm "SENTBRK ")

;; Tags that start a new block of text; clean-html replaces each with g_sbm.
;; NOTE(review): tag text lost (see header note); the stray "} also starts
;; an unterminated string here. Preserved as received.
(set 'g_start_tags (list "" "" "" "" "} {} {} {} {} {} {} {} {} {} {} {} )) ;; will use newlisp replace

;; Additional tags replaced with g_sbm via plain (non-regex) replace.
;; NOTE(review): tag text lost; several entries still contain the newline(s)
;; that followed the original tag, so the embedded blank lines below are
;; significant string content.
(set 'g_other_tags (list {
} {
} {
} {
} {

} {

} {} {} {} {} {} {} {} {} {} {} {} {} {
} {
} )) ;; will use newlisp replace

;; Tags replaced with g_sbm via regex replace (PCRE).
;; NOTE(review): tag text lost.
(set 'g_break_tags (list "" )) ;; will use PCRE replace

;; Tags that are kept but padded with a leading space (see clean-html).
;; NOTE(review): tag text lost.
(set 'g_table_tags (list {} {} {} {} )) ;; will use newlisp replace

;; Character/entity normalization table applied by asciify, in order.
;; Each entry is (from to). NOTE(review): entity names in the "from"
;; column were decoded during extraction, turning several entries into
;; no-ops (e.g. ("&" "&"), (""" {"})) - verify against a pristine copy.
(set 'g_ascii_translate '(
    ;; these are removed
    ({"} "") ("[edit]" "") (" ." "")
    ;; these are converted to spaces
    ("_" " ") (" " " ") (" " " ")
    ;; these are special characters
    (". ." ".") (".." ".") (":.." ":") ("’" {'}) ("\146" "'") ("“" {"})
    ("”" {"}) ("—" {-}) ("\151" "-") (" " " ") ("&" "&") (""" {"}) ("—" {-})
    ;; these are some UTF-8 characters
    ("\226\128\148" " - ") ("\226\128\147" " - ")
    ("\194\163" "£")     ; english pound character
    ("\195\169" "é")     ; accented e
    ("\226\128\162" "•") ; bullet
    ("\226\128\153" "'") ; reverse tick
))

;; Apply every (from to) pair of g_ascii_translate to str, in table order,
;; and return the rewritten string. replace mutates str in place, so the
;; trailing str returns the fully-translated text.
(define (asciify str)
    (dolist (a g_ascii_translate)
        (replace (a 0) str (a 1))
    )
    str
)

;; clean-html
;; In:  data        - raw HTML page text
;;      break_token - string inserted at every guaranteed sentence break;
;;                    nil is coerced to "" (i.e. breaks are just removed)
;; Out: data with markup stripped, known block boundaries replaced by
;;      break_token, characters normalized via asciify, and whitespace
;;      collapsed to single spaces.
;; Regex option values: per the original comments, 513 = PCRE_UNGREEDY(512)
;; + PCRE_CASELESS(1); 512 alone = PCRE_UNGREEDY.
;; NOTE(review): the empty {} / "" patterns below once held tags such as
;; <head>, <script>, <!-- --> (per the surrounding comments) and were
;; emptied by the damage described at the top of the file.
;; NOTE(review): g_heading_tags is referenced below but never defined in
;; this copy - presumably its (set ...) was lost too; confirm.
(define (clean-html data break_token, endHead pos)
    (unless break_token (set 'break_token ""))
    ; remove nils
    (replace "\\000" data "" 0)
    ;(replace "\n" data " ")
    ;; strip header
    (replace {.*} data "" 512)
    ;; strip text outside marked content
    (set 'endHead (find "" data))
    (if endHead (set 'data (slice data endHead)))
    ;; strip footer
    (set 'pos (find {
} data 1))
    (if pos (set 'data (slice data 0 pos)))
    ;; strip javascript and html comments
    (replace {} data "" 513)
    (replace {} data "" 513)
    (replace {} data "" 513)
    ;; Sentence Boundary Helpers
    (dolist (t g_start_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy
    (dolist (t g_heading_tags) (replace t data g_sbm))
    (dolist (t g_break_tags) (replace t data g_sbm 513)) ;; case insensitive + non-greedy
    (dolist (t g_other_tags) (replace t data g_sbm))
    ;; When we see these strings we believe there's a sentence break
    (replace {\n\n} data g_sbm)
    (replace {|} data g_sbm)
    (replace {::} data g_sbm)
    ;; keep table-ish tags but ensure they are separated by a space
    (dolist (t g_table_tags) (replace t data (append " " t)))
    ;; drop any remaining <...> tags and {...} runs
    (replace {<[^>]*>} data "" 0)
    (replace "{[^}]*}" data "" 0)
    ;; convert the internal marker into the caller's break token
    (replace (trim g_sbm) data break_token)
    (set 'data (asciify data))
    ;; This is a citation stripper - primarily for use with wikipedia
    (replace {\.\s*\[\d+\]} data "." 512)
    ; clean white space
    (replace "\\s+" data " " 0)
)

(context 'MAIN)

(context 'BOUNDARY)

;; Sentences whose length falls outside (MIN, MAX] are discarded
;; by GetSentences.
(set 'MIN_SENTENCE_LENGTH 9)
(set 'MAX_SENTENCE_LENGTH 512)

;; This list contains abbreviations that are longer than 2 characters.
;; NOTE(review): no such abbreviation list survives in this copy -
;; presumably lost with the other damaged literals; only the punctuation
;; character class below remains.
(set 'g_punct_regex "[\\(\\[\\]\\)\\.]")

;; Token marking a guaranteed sentence break (inserted by CLEAN:clean-html).
;; NOTE(review): empty in this copy; GetSentences calls
;; (parse x g_break_token), and an empty separator splits into single
;; characters - the original token text was almost certainly lost in
;; extraction; confirm before use.
(set 'g_break_token "")

;; GetSentences
;; In:  str - cleaned text, ideally the output of CLEAN:clean-html
;; Out: list of sentence strings within the configured length bounds
;; Locals: c is declared but never used; tmp (set below) is NOT declared
;; local and therefore leaks into the BOUNDARY context -
;; NOTE(review): likely an oversight in the original.
(define (GetSentences str , sentence_list word_list last_word i c final)
    ;; strip all double-quotes
    (replace {"} str "")
    (replace "\n" str " ")
    ;; We always break after an exclamation followed by a space.
    (replace "! " str (append "!" g_break_token))
    ;; break upon ". "
    (set 'sentence_list (parse str ". "))
    ;; break those pieces upon space; (filter if ...) presumably drops
    ;; empty word lists (() is false in newLISP) - verify
    (set 'word_list (filter if (map (fn(x) (parse x " ")) sentence_list)))
    (set 'i 0)
    (if (> (length word_list) 1)
        (while (< i (length word_list))
            ;; Take the last word in a sentence and remove any of the characters in g_punct_regex
            ;; NOTE(review): last_word is computed but never used afterwards.
            (set 'last_word (replace g_punct_regex (last (word_list i)) "" 1))
            (if
                ;; If the current sentence contains only one word which is not whitespace
                ;; we join it onto the prior sentence and replace the trimmed "."
                (and (= (length (word_list i)) 1) (not (find " " (word_list i))) )
                (begin
                    ; (println "Current : " (string (word_list i)))
                    (set 'tmp (pop word_list i))
                    ; (println "Joining : " (string (word_list (- i 1))) " with " (string tmp))
                    (setf (word_list (- i 1) -1) (append (word_list (- i 1) -1) "."))
                    (setf (word_list (- i 1)) (append (word_list (- i 1)) tmp))
                )
                ;; Otherwise this is a valid sentence break and we move onto the next sentence.
                (inc i)
            )
        )
    ) ;; end if
    ;; Re-assemble each word list into "words ... ." + break token.
    (set 'final '())
    (dolist (w word_list)
        (push (append (join w " ") "." g_break_token) final -1)
    )
    ;; Split again on the break token, trim each piece, and collapse any
    ;; run of trailing dots to a single ".".
    (set 'final (map (fn (z) (replace "\\.+\\z" z "." 0))
        (map trim (flat (map (fn(x) (parse x g_break_token)) final)))))
    ;; Keep only sentences within the configured length bounds.
    (filter (fn(x) (and (> (length x) MIN_SENTENCE_LENGTH)
                        (<= (length x) MAX_SENTENCE_LENGTH))) final)
)

(context 'MAIN)

;; Fetch url (timeout per get-url), strip markup via CLEAN:clean-html
;; (inserting BOUNDARY's break token at guaranteed breaks), then split
;; the result into sentences.
(define (url-to-sentences url timeout)
    (BOUNDARY:GetSentences
        (CLEAN:clean-html (get-url url timeout) BOUNDARY:g_break_token))
)

;; eof