# First, let's define some basic sets of words (with common abbreviations).
#
# Each variable below is a package-global string holding a regex fragment.
# The fragments contain layout whitespace and "# ..." comments, which are
# only ignored when the final regex is compiled with the /x modifier.
# NOTE(review): the consumer presumably compiles with /x -- confirm.
# Because /x also strips literal blanks, blanks that must really be matched
# are written as \s+ (see "Mardi\s+Gras" below).
#
# Groups added while fixing patterns use (?:...) so that the numbering of the
# existing capture groups does not shift.

# Day names: full name or abbreviation, with or without a trailing period.
$days = 'Sun(day|\.)?|           # All of these may be abbreviations,
         Mon(day|\.)?|           # with or without a period
         Tues?(day|\.)?|         # Accept Tue or Tues
         Wed(nesday|\.)?|
         Thurs?(day|\.)?|        # Accept Thur or Thurs
         Fri(day|\.)?|
         Sat(urday|\.)?';

# Month names: full name or abbreviation, with or without a trailing period.
$months = 'Jan(uary|\.)?|
           Feb(ruary|\.)?|
           Mar(ch|\.)?|
           Apr(il|\.)?|
           May|
           Jun(e|\.)?|
           Jul(?:y|\.)?|         # July, Jul or Jul. (was: only "Jul.")
           Aug(ust|\.)?|
           Sept?(ember|\.)?|     # Accept Sep or Sept
           Oct(ober|\.)?|
           Nov(ember|\.)?|
           Dec(ember|\.)?';

# Units of time, singular or plural, with common abbreviations.
$time_units = 'sec(onds?|\.)?|
               min(utes?|\.)?|   # bare "min" is accepted, like bare "sec"
               hours?|hrs?\.?|   # Accept hr or hrs, optionally with a period
               days?|
               weeks?|
               months?|
               years?';

# An admittedly incomplete list of "special" holidays.
# "Mardi\s+Gras" uses \s+ because a literal blank would be discarded by /x.
$holiday1 = 'Christmas|Easter|Thanksgiving|Halloween|
             Ramadan|Mardi\s+Gras|Passover|Kwanzaa?|Chanukah|Hanukkah';

# $holiday2 includes holidays like "Mother's Day" and "St. Patrick's Day".
# It figures out how many words are part of the holiday name based on
# capitalization.  Thus, it won't work quite right on a phrase like
# "It's Groundhog's Day" because it will include "It's" within the TIME tag.
# Holiday names that end in a capitalized "Day" or "Eve".
# Every leading capitalized word must be followed by whitespace (\s+).  This
# leaves exactly one way to split a run of capitals into words; with \s* the
# nested quantifiers could backtrack exponentially on long capitalized text.
$holiday2 = '([A-Z][\w\'\.]*\s+)*   # Zero or more capitalized "words"
             (Day|Eve)';             # A trailing (capitalized) "Day" or "Eve"
                                     # tells us it's probably a holiday

# All of the basic sets above
$basics = "$days|$months|$time_units|$holiday1|$holiday2";

# Various numerical forms which will be used below
$one_to_9 = '[Oo]ne|[Tt]wo|[Tt]hree|[Ff]our|
             [Ff]ive|[Ss]ix|[Ss]even|[Ee]ight|[Nn]ine';
$ten_to_12 = '[Tt]en|[Ee]leven|[Tt]welve';
$thirteen_to_19 = '[Tt]hirteen|[Ff]ourteen|[Ff]ifteen|
                   [Ss]ixteen|[Ss]eventeen|[Ee]ighteen|[Nn]ineteen';
$twenty_to_31 = "[Tt]wenty(-($one_to_9))?|[Tt]hirty(-one)?";

# For days of the month, where we only expect numbers 1 to 31.
# Includes both numeric and alphabetic number forms.
# Longer words come before their prefixes ("fourteen" before "four") so the
# whole word wins even when nothing after the number forces backtracking.
$all_num = "$twenty_to_31|$thirteen_to_19|$ten_to_12|
            $one_to_9|[12][0-9]|3[01]|[1-9]";

# Sometimes we need more than just 31 numbers.
# "forty" and "ninety" are the correct spellings; the misspellings "fourty"
# and "ninty" that this list used to contain are still tolerated.
$tens = '[Tt]wenty|[Tt]hirty|[Ff]ou?rty|[Ff]ifty|
         [Ss]ixty|[Ss]eventy|[Ee]ighty|[Nn]ine?ty';

# Spelled-out numbers from one to nine hundred (and) ninety-nine.
# $tens is wrapped in (?:...) so the optional "-unit" suffix applies to every
# tens word, not just to the last alternative of the list.
# NOTE: the trailing 1-99 part is mandatory, so a bare "two hundred" is not
# matched as a whole.
$full_num = "(($one_to_9)\\s*            # (a number from one to nine
              [Hh]undred\\s*             #  hundred
              (and\\s*)?)?               #  (and) -- optional) -- optional
             (((?:$tens)(-($one_to_9))?)|  # twenty .. ninety-nine
              $thirteen_to_19|           # then the teens,
              $ten_to_12|                # ten to twelve,
              $one_to_9)";               # and finally one to nine

# "First", "Second", "Third" forms needed for "first of October"
$first_to_ninth = '[Ff]irst|[Ss]econd|[Tt]hird|[Ff]ourth|
                   [Ff]ifth|[Ss]ixth|[Ss]eventh|[Ee]ighth|[Nn]inth';
# "twelfth" and "nineteenth" are the correct spellings; the old misspellings
# "twelvth" and "ninteenth" are still tolerated.
$tenth_to_19th = '[Tt]enth|[Ee]leventh|[Tt]wel[fv]th|[Tt]hirteenth|
                  [Ff]ourteenth|[Ff]ifteenth|[Ss]ixteenth|
                  [Ss]eventeenth|[Ee]ighteenth|[Nn]ine?teenth';
$twentieth_to_31st = "[Tt]went((y-($first_to_ninth))|ieth)|[Tt]hirt((y-first)|ieth)";

# Numeric 1st, 2nd, 3rd forms (1st through 31st)
$numeric_th = '1st|2nd|3rd|20th|21st|22nd|23rd|30th|31st|[4-9]th|1[0-9]th|2[4-9]th';

# All of the above 1st/first forms
$all_th = "$twentieth_to_31st|$first_to_ninth|$tenth_to_19th|$numeric_th";

# Standard numeric forms for the year, from 1000-2999.
# Unless we're dealing with historical texts, a number outside of this range
# is unlikely to be a year.
# Four-digit years from 1000 to 2999.
$years = '[12][0-9]{3}';

# Ditto for decades, 1000's through 2990's (includes both "1990s" and "1990's").
# The pattern is three digits, a literal 0, an optional apostrophe and "s".
# The previous value '[12][0-p]0' was only three characters long, used a
# garbled character range and never required the trailing "s".
$decades = '[12][0-9]{2}0\'?s';

# I've NOT included alphabetic year forms (i.e., nineteen hundred ninety six)

# Some common date forms.
# These fragments contain layout whitespace and "# ..." comments, so they
# only work in a regex compiled with /x.
$date1 = "(($days),?\\s*)?($months)\\s*      # March 5th, 1922 (year optional)
          ($all_th|$all_num)(,\\s*$years)?";
$date2 = "(($days),?\\s*)?($all_th)\\s*      # 5th of March, 1922 (year optional)
          of\\s*($months)(,\\s*$years)?";

# Numeric date forms for 1900s and 2000s (such as 11/29/01 or 29-11-2001).
# The year is two digits, optionally preceded by the century 19 or 20.
$date3 = "([12][0-9]|3[01]|[1-9])-           # Day-month-year
          ([1-9]|1[0-2])
          -(19|20)?[0-9]{2}";
$date4 = "([1-9]|1[0-2])/                    # Month/day/year
          ([12][0-9]|3[01]|[1-9])
          /(19|20)?[0-9]{2}";
$date5 = "([1-9]|1[0-2])-                    # Month-day-year
          ([12][0-9]|3[01]|[1-9])
          -(19|20)?[0-9]{2}";
$date6 = "([12][0-9]|3[01]|[1-9])/           # Day/month/year
          ([1-9]|1[0-2])
          /(19|20)?[0-9]{2}";

# Centuries, from the 1st to the 31st
$century = "($all_th)\\s*[Cc]entury";

# All of the various date and year type forms given above.
# $decades is listed before $years so that "1990s" is taken as a decade
# rather than as the year 1990 followed by a stray "s".
$all_dates = "$date1|$date2|$date3|$date4|$date5|$date6|$century|$decades|$years";

# Time-of-day forms.  I have not included international/military 24 hour clock times.
# Vague times of day: morning, noon, afternoon, evening, tonight, night, midnight.
$vague_times = '[Mm]orning|[Nn]oon|[Aa]fternoon|[Ee]vening|([Tt]on|[Nn]|[Mm]idn)ight';

# Clock-hour forms.  The fragments below contain layout whitespace and
# "# ..." comments, so they only work in a regex compiled with /x.
#
# In every hour alternation the two-digit form 1[0-2] is listed before the
# one-digit form [1-9].  Where the hour is the last element of the pattern
# (half past / quarter to), the old order stopped after the first digit, so
# "half past 12" was matched as "half past 1".
$o_clock = "($one_to_9|$ten_to_12|1[0-2]|[1-9])\\s*    # six o'clock
            (([Oo]'\\s*clock)|                          # or 2am, 8 pm
             ([Aa]|[Pp])[Mm])";
$half_past = "[Hh]alf\\s*past\\s*                       # half past eight
              ($one_to_9|$ten_to_12|1[0-2]|[1-9])";
$quarter_form = "([Aa]\\s*)?[Qq]uarter\\s*              # (a) quarter to/after/past three
                 (to|after|past)\\s*
                 ($one_to_9|$ten_to_12|1[0-2]|[1-9])";
# Whitespace after the minutes is only consumed when am/pm really follows
# (checked with a lookahead), so "6:45 sharp" yields "6:45" without the
# trailing blank.
$colon_form = "(1[0-2]|[1-9]):                          # standard 6:45am form
               [0-5][0-9](\\s*(?=[AaPp][Mm]))?(([Aa]|[Pp])[Mm])?";

# All of the above time forms
$all_times = "$vague_times|$o_clock|$half_past|$quarter_form|$colon_form";

# Deictic expressions (times given relative to now)
$basic_deictic = '[Tt]oday|[Yy]esterday|[Tt]omorrow';
$special_deictic = '[Tt]he\s*day\s*                     # recognizes "the day before yesterday"
                    (before\s*yesterday|after\s*tomorrow)';   # and "the day after tomorrow"

# In the three forms below the digit alternative [0-9]* may match nothing,
# so a bare unit such as "days ago" or "within weeks" is matched as well.
$n_from_deictic = "($full_num|a|[0-9]*)\\s*             # six days from Sunday
                   ($time_units)\\s*from\\s*            # or six minutes from now, etc.
                   ($days|$basic_deictic|now)";
$ago_deictic = "($full_num|[0-9]*)\\s*                  # Twelve days ago/hence/earlier etc.
                ($time_units)\\s*
                (ago|hence|before|after|later|earlier)";
$in_deictic = "([Ww]ithin|[Ii]n|                        # In/within/after/before eight minutes
                [Aa]fter|[Bb]efore)\\s*
               ($full_num|[0-9]*)\\s*($time_units)";
$next_deictic = "([Nn]ext)\\s*
                 ($days|week|month|year)";

# All of the previously defined deictic forms
$all_deictic = "$basic_deictic|$special_deictic|
                $n_from_deictic|$ago_deictic|$next_deictic|$in_deictic";