#!/usr/bin/newlisp
;; Creative Commons Attribution (by) License v2.5
;; Full text - http://creativecommons.org/licenses/by/2.5/
;; Contact - gordon.fischer@gmail.com, desanto@mac.com
;; Copyright (c) 2006, kozoru, Inc.
;; Updated to newLISP v.10.0, L.M. May 2009
;; SentenceBoundary.lsp
;; $Rev: 2701 $
;; - Uses contexts CLEAN and BOUNDARY
;; - Supplies the following function in MAIN
;;     (url-to-sentences "http://en.wikipedia.org/wiki/Grady_(band)" 15000)
;;     (url-to-sentences "http://www.bbc.co.uk/drama/spooks/series4_ep10.shtml" 10000)
;; - (CLEAN:clean-html str token)
;;     If token is not nil, token will be inserted at every guaranteed sentence
;;     break (ie. after etc)
;;     NOTE(review): the example tag names that followed "after" appear to be
;;     missing from this copy of the file -- restore them from upstream.
;; - (BOUNDARY:GetSentences str)
;;     Breaks str on sentence boundaries. Can be used with CLEAN:clean-html to
;;     leverage HTML markup to ensure sentence breaks on known HTML characters.
;;     (eg. (CLEAN:clean-html str BOUNDARY:g_break_token) )

;; Everything defined below lives in the CLEAN context.
(context 'CLEAN)

;;
;; Global Configuration
;;

;; The string that we use to insert upon a known sentence boundary mark (SBM).
;; We include a space after the marker to ensure proper word separation.
(set 'g_sbm "SENTBRK ")

;; NOTE(review): the literals in the list below look damaged -- the double
;; quotes are unbalanced and the brace strings are empty, presumably because
;; the HTML tag text they held was stripped from this copy. The tokens are kept
;; exactly as found; restore the tag strings from upstream before loading.
(set 'g_start_tags (list "" "} {} )) ;; will use newlisp replace

;; NOTE(review): the g_other_tags list opened here continues on the lines that
;; follow; its literals appear to be affected by the same damage.
(set 'g_other_tags (list {} {} {" "
" "
} {} {} {} {} {} {} {} {} {} {