Path: blob/master/src/applications/calendar/parser/ics/PhutilICSParser.php
12262 views
<?php

/**
 * Parser for iCalendar (ICS) documents.
 *
 * Call @{method:parseICSData} with the raw document text to get back a tree
 * of calendar nodes. Hard failures are raised as
 * @{class:PhutilICSParserException} carrying one of the `PARSE_*` failure
 * codes. Recoverable oddities are recorded as warnings carrying one of the
 * `WARN_*` codes and can be read with @{method:getWarnings} after parsing.
 */
final class PhutilICSParser extends Phobject {

  // Stack of nodes currently open ("BEGIN" seen, "END" not yet seen). The
  // first element is always the synthetic root node while parsing.
  private $stack;

  // The innermost open node, i.e. the last element of the stack.
  private $node;

  // NOTE: This property is not read or written anywhere in this class.
  private $document;

  // Lines of the document being parsed, keyed by original line index. Used
  // to give context in failure and warning messages.
  private $lines;

  // Index (into the lines) of the line currently being processed.
  private $cursor;

  // List of warnings raised during the most recent parse.
  private $warnings;

  const PARSE_MISSING_END = 'missing-end';
  const PARSE_INITIAL_UNFOLD = 'initial-unfold';
  const PARSE_UNEXPECTED_CHILD = 'unexpected-child';
  const PARSE_EXTRA_END = 'extra-end';
  const PARSE_MISMATCHED_SECTIONS = 'mismatched-sections';
  const PARSE_ROOT_PROPERTY = 'root-property';
  const PARSE_BAD_BASE64 = 'bad-base64';
  const PARSE_BAD_BOOLEAN = 'bad-boolean';
  const PARSE_UNEXPECTED_TEXT = 'unexpected-text';
  const PARSE_MALFORMED_DOUBLE_QUOTE = 'malformed-double-quote';
  const PARSE_MALFORMED_PARAMETER_NAME = 'malformed-parameter';
  const PARSE_MALFORMED_PROPERTY = 'malformed-property';
  const PARSE_MISSING_VALUE = 'missing-value';
  const PARSE_UNESCAPED_BACKSLASH = 'unescaped-backslash';
  const PARSE_MULTIPLE_PARAMETERS = 'multiple-parameters';
  const PARSE_EMPTY_DATETIME = 'empty-datetime';
  const PARSE_MANY_DATETIME = 'many-datetime';
  const PARSE_BAD_DATETIME = 'bad-datetime';
  const PARSE_EMPTY_DURATION = 'empty-duration';
  const PARSE_MANY_DURATION = 'many-duration';
  const PARSE_BAD_DURATION = 'bad-duration';

  const WARN_TZID_UTC = 'warn-tzid-utc';
  const WARN_TZID_GUESS = 'warn-tzid-guess';
  const WARN_TZID_IGNORED = 'warn-tzid-ignored';

  /**
   * Parse an ICS document into a tree of calendar nodes.
   *
   * Resets all parser state (including warnings from any previous parse),
   * unfolds the input into logical lines, then dispatches each line as a
   * "BEGIN:", an "END:", or a property of the innermost open node.
   *
   * @param string Raw ICS document text.
   * @return PhutilCalendarRootNode Root of the parsed node tree.
   * @throws PhutilICSParserException If the document can not be parsed.
   */
  public function parseICSData($data) {
    $this->stack = array();
    $this->node = null;
    $this->cursor = null;
    $this->warnings = array();

    $lines = $this->unfoldICSLines($data);
    $this->lines = $lines;

    $root = $this->newICSNode('<ROOT>');
    $this->stack[] = $root;
    $this->node = $root;

    foreach ($lines as $key => $line) {
      $this->cursor = $key;
      $matches = null;
      if (preg_match('(^BEGIN:(.*)\z)', $line, $matches)) {
        $this->beginParsingNode($matches[1]);
      } else if (preg_match('(^END:(.*)\z)', $line, $matches)) {
        $this->endParsingNode($matches[1]);
      } else {
        // Only the root node is open, so this property is not inside any
        // "BEGIN" section.
        if (count($this->stack) < 2) {
          $this->raiseParseFailure(
            self::PARSE_ROOT_PROPERTY,
            pht(
              'Found unexpected property at ICS document root.'));
        }
        $this->parseICSProperty($line);
      }
    }

    if (count($this->stack) > 1) {
      $this->raiseParseFailure(
        self::PARSE_MISSING_END,
        pht(
          'Expected all "BEGIN:" sections in ICS document to have '.
          'corresponding "END:" sections.'));
    }

    $this->node = null;
    $this->lines = null;
    $this->cursor = null;

    return $root;
  }

  /**
   * Get the innermost node which is currently open.
   */
  private function getNode() {
    return $this->node;
  }

  /**
   * Split raw ICS data into logical lines, joining folded continuations.
   *
   * The returned list keeps the original line indexes as keys (continuation
   * lines are removed, leaving gaps) so failure messages can refer to the
   * line number in the source document.
   *
   * @param string Raw ICS document text.
   * @return map<int, string> Unfolded lines, keyed by original line index.
   */
  private function unfoldICSLines($data) {
    $lines = phutil_split_lines($data, $retain_endings = false);
    $this->lines = $lines;

    // ICS files are wrapped at 75 characters, with overlong lines continued
    // on the following line with an initial space or tab. Unwrap all of the
    // lines in the file.

    // This unwrapping is specifically byte-oriented, not character oriented,
    // and RFC5545 anticipates that simple implementations may even split UTF8
    // characters in the middle.

    $last = null;
    foreach ($lines as $idx => $line) {
      $this->cursor = $idx;
      if (!preg_match('/^[ \t]/', $line)) {
        $last = $idx;
        continue;
      }

      if ($last === null) {
        $this->raiseParseFailure(
          self::PARSE_INITIAL_UNFOLD,
          pht(
            'First line of ICS file begins with a space or tab, but this '.
            'marks a line which should be unfolded.'));
      }

      // Drop the single leading fold character and append the remainder to
      // the most recent unfolded line.
      $lines[$last] = $lines[$last].substr($line, 1);
      unset($lines[$idx]);
    }

    return $lines;
  }

  /**
   * Open a new child node of the given type under the current node.
   *
   * @param string Section type from a "BEGIN:<type>" line.
   * @return this
   */
  private function beginParsingNode($type) {
    $node = $this->getNode();
    $new_node = $this->newICSNode($type);

    if ($node instanceof PhutilCalendarContainerNode) {
      $node->appendChild($new_node);
    } else {
      $this->raiseParseFailure(
        self::PARSE_UNEXPECTED_CHILD,
        pht(
          'Found unexpected node "%s" inside node "%s".',
          $new_node->getAttribute('ics.type'),
          $node->getAttribute('ics.type')));
    }

    $this->stack[] = $new_node;
    $this->node = $new_node;

    return $this;
  }

  /**
   * Build a node object for a section type.
   *
   * Unrecognized types produce a raw node. The type is always recorded on
   * the node as the "ics.type" attribute.
   *
   * @param string Section type, or "<ROOT>" for the synthetic root.
   * @return wild New node object.
   */
  private function newICSNode($type) {
    switch ($type) {
      case '<ROOT>':
        $node = new PhutilCalendarRootNode();
        break;
      case 'VCALENDAR':
        $node = new PhutilCalendarDocumentNode();
        break;
      case 'VEVENT':
        $node = new PhutilCalendarEventNode();
        break;
      default:
        $node = new PhutilCalendarRawNode();
        break;
    }

    $node->setAttribute('ics.type', $type);

    return $node;
  }

  /**
   * Close the current node in response to an "END:<type>" line.
   *
   * @param string Section type from the "END:<type>" line.
   * @return this
   */
  private function endParsingNode($type) {
    $node = $this->getNode();
    if ($node instanceof PhutilCalendarRootNode) {
      $this->raiseParseFailure(
        self::PARSE_EXTRA_END,
        pht(
          'Found unexpected "END" without a "BEGIN".'));
    }

    // Compare strictly: both values are strings, and a loose comparison
    // would treat distinct numeric-looking types (like "1e3" and "1000") as
    // a matching pair.
    $old_type = $node->getAttribute('ics.type');
    if ($old_type !== $type) {
      $this->raiseParseFailure(
        self::PARSE_MISMATCHED_SECTIONS,
        pht(
          'Found mismatched "BEGIN" ("%s") and "END" ("%s") sections.',
          $old_type,
          $type));
    }

    array_pop($this->stack);
    $this->node = last($this->stack);

    return $this;
  }

  /**
   * Parse one unfolded property line and attach it to the current node.
   *
   * The parsed property is appended to the node's "ics.properties"
   * attribute as a dictionary with "name", "parameters" and "value" keys.
   * Properties of "VEVENT" nodes are additionally applied to the event node.
   *
   * @param string Unfolded property line.
   * @return void
   */
  private function parseICSProperty($line) {
    $matches = null;

    // Properties begin with an alphanumeric name with no escaping, followed
    // by either a ";" (to begin a list of parameters) or a ":" (to begin
    // the actual field body).

    $ok = preg_match('(^([A-Za-z0-9-]+)([;:])(.*)\z)', $line, $matches);
    if (!$ok) {
      $this->raiseParseFailure(
        self::PARSE_MALFORMED_PROPERTY,
        pht(
          'Found malformed property in ICS document.'));
    }

    $name = $matches[1];
    $body = $matches[3];
    $has_parameters = ($matches[2] == ';');

    $parameters = array();
    if ($has_parameters) {
      // Parameters are a sensible name, a literal "=", a pile of magic,
      // and then maybe a comma and another parameter.

      while (true) {
        // We're going to get the first couple of parts first.
        $ok = preg_match('(^([^=]+)=)', $body, $matches);
        if (!$ok) {
          $this->raiseParseFailure(
            self::PARSE_MALFORMED_PARAMETER_NAME,
            pht(
              'Found malformed property in ICS document: %s',
              $body));
        }

        $param_name = $matches[1];
        $body = substr($body, strlen($matches[0]));

        // Now we're going to match zero or more values.
        $param_values = array();
        while (true) {
          // The value can either be a double-quoted string or an unquoted
          // string, with some characters forbidden.

          // NOTE(review): The excluded ranges below are "\x00-\x08" and
          // "\x10-\x19". RFC5545 appears to define CONTROL as %x00-08,
          // %x0A-1F and %x7F, so the second range may have been intended as
          // "\x0A-\x1F". The patterns are left unchanged because tightening
          // them would reject input which is currently accepted -- confirm
          // before changing.
          if (strlen($body) && $body[0] == '"') {
            $is_quoted = true;
            $ok = preg_match(
              '(^"([^\x00-\x08\x10-\x19"]*)")',
              $body,
              $matches);
            if (!$ok) {
              $this->raiseParseFailure(
                self::PARSE_MALFORMED_DOUBLE_QUOTE,
                pht(
                  'Found malformed double-quoted string in ICS document '.
                  'parameter value.'));
            }
          } else {
            $is_quoted = false;

            // It's impossible for this not to match since it can match
            // nothing, and it's valid for it to match nothing.
            preg_match('(^([^\x00-\x08\x10-\x19";:,]*))', $body, $matches);
          }

          // NOTE: RFC5545 says "Property parameter values that are not in
          // quoted-strings are case-insensitive." -- that is, the quoted and
          // unquoted representations are not equivalent. Thus, preserve the
          // original formatting in case we ever need to respect this.

          $param_values[] = array(
            'value' => $this->unescapeParameterValue($matches[1]),
            'quoted' => $is_quoted,
          );

          $body = substr($body, strlen($matches[0]));
          if (!strlen($body)) {
            $this->raiseParseFailure(
              self::PARSE_MISSING_VALUE,
              pht(
                'Expected ":" after parameters in ICS document property.'));
          }

          // If we have a comma now, we're going to read another value. Strip
          // it off and keep going.
          if ($body[0] == ',') {
            $body = substr($body, 1);
            continue;
          }

          // If we have a semicolon, we're going to read another parameter.
          if ($body[0] == ';') {
            break;
          }

          // If we have a colon, this is the last value and also the last
          // property. Break, then handle the colon below.
          if ($body[0] == ':') {
            break;
          }

          $short_body = id(new PhutilUTF8StringTruncator())
            ->setMaximumGlyphs(32)
            ->truncateString($body);

          // We aren't expecting anything else.
          $this->raiseParseFailure(
            self::PARSE_UNEXPECTED_TEXT,
            pht(
              'Found unexpected text ("%s") after reading parameter value.',
              $short_body));
        }

        $parameters[] = array(
          'name' => $param_name,
          'values' => $param_values,
        );

        // The value loop above only exits with ";" or ":" at the head of
        // the remaining body.
        if ($body[0] == ';') {
          $body = substr($body, 1);
          continue;
        }

        if ($body[0] == ':') {
          $body = substr($body, 1);
          break;
        }
      }
    }

    $value = $this->unescapeFieldValue($name, $parameters, $body);

    $node = $this->getNode();

    $raw = $node->getAttribute('ics.properties', array());
    $raw[] = array(
      'name' => $name,
      'parameters' => $parameters,
      'value' => $value,
    );
    $node->setAttribute('ics.properties', $raw);

    switch ($node->getAttribute('ics.type')) {
      case 'VEVENT':
        $this->didParseEventProperty($node, $name, $parameters, $value);
        break;
    }
  }

  /**
   * Decode RFC6868 caret escaping in a parameter value.
   *
   * Decodes "^n" to a newline, "^^" to a caret, and "^'" to a double quote.
   * A caret followed by any other character, or a caret at the very end of
   * the value, is kept literally.
   *
   * @param string Raw parameter value (without surrounding quotes).
   * @return string Decoded parameter value.
   */
  private function unescapeParameterValue($data) {
    // The parameter grammar is adjusted by RFC6868 to permit escaping with
    // carets. Remove that escaping.

    // This escaping is a bit weird because it's trying to be backwards
    // compatible and the original spec didn't think about this and didn't
    // provide much room to fix things.

    $out = '';
    $esc = false;
    foreach (phutil_utf8v($data) as $c) {
      if (!$esc) {
        if ($c != '^') {
          $out .= $c;
        } else {
          $esc = true;
        }
      } else {
        switch ($c) {
          case 'n':
            $out .= "\n";
            break;
          case '^':
            $out .= '^';
            break;
          case "'":
            // NOTE: This is "<caret> <single quote>" being decoded into a
            // double quote!
            $out .= '"';
            break;
          default:
            // NOTE: The caret is NOT an escape for any other characters.
            // This is a "MUST" requirement of RFC6868.
            $out .= '^'.$c;
            break;
        }

        // The escape applies to exactly one character. Without this reset,
        // every character after the first caret would be treated as
        // escaped.
        $esc = false;
      }
    }

    // NOTE: Because caret on its own just means "caret" for backward
    // compatibility, we don't warn if we're still in escaped mode once we
    // reach the end of the string. Emit the pending caret literally rather
    // than dropping it.
    if ($esc) {
      $out .= '^';
    }

    return $out;
  }

  /**
   * Decode a property body according to its value type.
   *
   * The type comes from a per-property default, falling back to "TEXT",
   * and may be overridden by a "VALUE" parameter.
   *
   * @param string Property name.
   * @param list<map<string, wild>> Parsed parameters.
   * @param string Raw property body.
   * @return map<string, wild> Dictionary with "type", "value" and "raw".
   */
  private function unescapeFieldValue($name, array $parameters, $data) {
    // NOTE: The encoding of the field value data is dependent on the field
    // name (which defines a default encoding) and the parameters (which may
    // include "VALUE", specifying a type of the data).

    $default_types = array(
      'CALSCALE' => 'TEXT',
      'METHOD' => 'TEXT',
      'PRODID' => 'TEXT',
      'VERSION' => 'TEXT',

      'ATTACH' => 'URI',
      'CATEGORIES' => 'TEXT',
      'CLASS' => 'TEXT',
      'COMMENT' => 'TEXT',
      'DESCRIPTION' => 'TEXT',

      // TODO: The spec appears to contradict itself: it says that the value
      // type is FLOAT, but it also says that this property value is actually
      // two semicolon-separated values, which is not what FLOAT is defined as.
      'GEO' => 'TEXT',

      'LOCATION' => 'TEXT',
      'PERCENT-COMPLETE' => 'INTEGER',
      'PRIORITY' => 'INTEGER',
      'RESOURCES' => 'TEXT',
      'STATUS' => 'TEXT',
      'SUMMARY' => 'TEXT',

      'COMPLETED' => 'DATE-TIME',
      'DTEND' => 'DATE-TIME',
      'DUE' => 'DATE-TIME',
      'DTSTART' => 'DATE-TIME',
      'DURATION' => 'DURATION',
      'FREEBUSY' => 'PERIOD',
      'TRANSP' => 'TEXT',

      'TZID' => 'TEXT',
      'TZNAME' => 'TEXT',
      'TZOFFSETFROM' => 'UTC-OFFSET',
      'TZOFFSETTO' => 'UTC-OFFSET',
      'TZURL' => 'URI',

      'ATTENDEE' => 'CAL-ADDRESS',
      'CONTACT' => 'TEXT',
      'ORGANIZER' => 'CAL-ADDRESS',
      'RECURRENCE-ID' => 'DATE-TIME',
      'RELATED-TO' => 'TEXT',
      'URL' => 'URI',
      'UID' => 'TEXT',
      'EXDATE' => 'DATE-TIME',
      'RDATE' => 'DATE-TIME',
      'RRULE' => 'RECUR',

      'ACTION' => 'TEXT',
      'REPEAT' => 'INTEGER',
      'TRIGGER' => 'DURATION',

      'CREATED' => 'DATE-TIME',
      'DTSTAMP' => 'DATE-TIME',
      'LAST-MODIFIED' => 'DATE-TIME',
      'SEQUENCE' => 'INTEGER',

      'REQUEST-STATUS' => 'TEXT',
    );

    $value_type = idx($default_types, $name, 'TEXT');

    // An explicit "VALUE" parameter overrides the default type. If several
    // are present, the last one wins.
    foreach ($parameters as $parameter) {
      if ($parameter['name'] == 'VALUE') {
        $value_type = idx(head($parameter['values']), 'value');
      }
    }

    switch ($value_type) {
      case 'BINARY':
        $result = base64_decode($data, true);
        if ($result === false) {
          $this->raiseParseFailure(
            self::PARSE_BAD_BASE64,
            pht(
              'Unable to decode base64 data: %s',
              $data));
        }
        break;
      case 'BOOLEAN':
        $map = array(
          'true' => true,
          'false' => false,
        );
        $result = phutil_utf8_strtolower($data);
        if (!isset($map[$result])) {
          $this->raiseParseFailure(
            self::PARSE_BAD_BOOLEAN,
            pht(
              'Unexpected BOOLEAN value "%s".',
              $data));
        }
        $result = $map[$result];
        break;
      case 'CAL-ADDRESS':
        $result = $data;
        break;
      case 'DATE':
        // This is a comma-separated list of "YYYYMMDD" values.
        $result = explode(',', $data);
        break;
      case 'DATE-TIME':
        // An empty body decodes to an empty list, not a list containing
        // one empty string.
        if (!strlen($data)) {
          $result = array();
        } else {
          $result = explode(',', $data);
        }
        break;
      case 'DURATION':
        if (!strlen($data)) {
          $result = array();
        } else {
          $result = explode(',', $data);
        }
        break;
      case 'FLOAT':
        $result = explode(',', $data);
        foreach ($result as $k => $v) {
          $result[$k] = (float)$v;
        }
        break;
      case 'INTEGER':
        $result = explode(',', $data);
        foreach ($result as $k => $v) {
          $result[$k] = (int)$v;
        }
        break;
      case 'PERIOD':
        $result = explode(',', $data);
        break;
      case 'RECUR':
        $result = $data;
        break;
      case 'TEXT':
        $result = $this->unescapeTextValue($data);
        break;
      case 'TIME':
        $result = explode(',', $data);
        break;
      case 'URI':
        $result = $data;
        break;
      case 'UTC-OFFSET':
        $result = $data;
        break;
      default:
        // RFC5545 says we MUST preserve the data for any types we don't
        // recognize.
        $result = $data;
        break;
    }

    return array(
      'type' => $value_type,
      'value' => $result,
      'raw' => $data,
    );
  }

  /**
   * Decode a TEXT property body into a list of values.
   *
   * Unescaped commas separate values. A backslash escapes the following
   * character: "\n" and "\N" decode to a newline, and any other escaped
   * character decodes to itself.
   *
   * @param string Raw TEXT body.
   * @return list<string> Decoded values (always at least one).
   */
  private function unescapeTextValue($data) {
    $result = array();

    $buf = '';
    $esc = false;
    foreach (phutil_utf8v($data) as $c) {
      if (!$esc) {
        if ($c == '\\') {
          $esc = true;
        } else if ($c == ',') {
          $result[] = $buf;
          $buf = '';
        } else {
          $buf .= $c;
        }
      } else {
        switch ($c) {
          case 'n':
          case 'N':
            $buf .= "\n";
            break;
          default:
            $buf .= $c;
            break;
        }
        $esc = false;
      }
    }

    if ($esc) {
      $this->raiseParseFailure(
        self::PARSE_UNESCAPED_BACKSLASH,
        pht(
          'ICS document contains TEXT value ending with unescaped '.
          'backslash.'));
    }

    $result[] = $buf;

    return $result;
  }

  /**
   * Abort parsing by throwing a parser exception.
   *
   * When the current line is known, the message is prefixed with its
   * (1-based) line number and content.
   *
   * @param const One of the `PARSE_*` failure codes.
   * @param string Human-readable description of the failure.
   * @return void This method always throws.
   * @throws PhutilICSParserException Always.
   */
  private function raiseParseFailure($code, $message) {
    if ($this->lines && isset($this->lines[$this->cursor])) {
      $message = pht(
        "ICS Parse Error near line %s:\n\n>>> %s\n\n%s",
        $this->cursor + 1,
        $this->lines[$this->cursor],
        $message);
    } else {
      $message = pht(
        'ICS Parse Error: %s',
        $message);
    }

    throw id(new PhutilICSParserException($message))
      ->setParserFailureCode($code);
  }

  /**
   * Record a non-fatal warning about the line currently being parsed.
   *
   * @param const One of the `WARN_*` codes.
   * @param string Human-readable description of the problem.
   * @return this
   */
  private function raiseWarning($code, $message) {
    $this->warnings[] = array(
      'code' => $code,
      'line' => $this->cursor,
      'text' => $this->lines[$this->cursor],
      'message' => $message,
    );

    return $this;
  }

  /**
   * Get warnings raised by the most recent call to @{method:parseICSData}.
   *
   * @return list<map<string, wild>> Warnings, each with "code", "line",
   *   "text" and "message" keys.
   */
  public function getWarnings() {
    return $this->warnings;
  }

  /**
   * Apply a parsed property to an event node.
   *
   * Property names not listed here are ignored by this method; they remain
   * available through the node's "ics.properties" attribute.
   *
   * @param PhutilCalendarEventNode Event node to update.
   * @param string Property name.
   * @param list<map<string, wild>> Parsed parameters.
   * @param map<string, wild> Decoded value from unescapeFieldValue().
   * @return void
   */
  private function didParseEventProperty(
    PhutilCalendarEventNode $node,
    $name,
    array $parameters,
    array $value) {

    switch ($name) {
      case 'UID':
        $text = $this->newTextFromProperty($parameters, $value);
        $node->setUID($text);
        break;
      case 'CREATED':
        $datetime = $this->newDateTimeFromProperty($parameters, $value);
        $node->setCreatedDateTime($datetime);
        break;
      case 'DTSTAMP':
        $datetime = $this->newDateTimeFromProperty($parameters, $value);
        $node->setModifiedDateTime($datetime);
        break;
      case 'SUMMARY':
        $text = $this->newTextFromProperty($parameters, $value);
        $node->setName($text);
        break;
      case 'DESCRIPTION':
        $text = $this->newTextFromProperty($parameters, $value);
        $node->setDescription($text);
        break;
      case 'DTSTART':
        $datetime = $this->newDateTimeFromProperty($parameters, $value);
        $node->setStartDateTime($datetime);
        break;
      case 'DTEND':
        $datetime = $this->newDateTimeFromProperty($parameters, $value);
        $node->setEndDateTime($datetime);
        break;
      case 'DURATION':
        $duration = $this->newDurationFromProperty($parameters, $value);
        $node->setDuration($duration);
        break;
      case 'RRULE':
        $rrule = $this->newRecurrenceRuleFromProperty($parameters, $value);
        $node->setRecurrenceRule($rrule);
        break;
      case 'RECURRENCE-ID':
        $text = $this->newTextFromProperty($parameters, $value);
        $node->setRecurrenceID($text);
        break;
      case 'ATTENDEE':
        $attendee = $this->newAttendeeFromProperty($parameters, $value);
        $node->addAttendee($attendee);
        break;
    }

  }

  /**
   * Flatten a decoded property value into a single string.
   *
   * Multiple values are joined with a blank line between them.
   *
   * @param list<map<string, wild>> Parsed parameters (unused).
   * @param map<string, wild> Decoded value from unescapeFieldValue().
   * @return string Joined text.
   */
  private function newTextFromProperty(array $parameters, array $value) {
    // A "VALUE" parameter can select a type which decodes to a scalar
    // instead of a list (for example, "URI"). Normalize to a list so that
    // implode() never receives a non-array.
    $value = (array)$value['value'];
    return implode("\n\n", $value);
  }

  /**
   * Build an attendee node from an "ATTENDEE" property.
   *
   * The participation status comes from the "PARTSTAT" parameter and
   * defaults to "invited". The display name comes from the "CN" parameter.
   *
   * @param list<map<string, wild>> Parsed parameters.
   * @param map<string, wild> Decoded value from unescapeFieldValue().
   * @return PhutilCalendarUserNode New attendee node.
   */
  private function newAttendeeFromProperty(array $parameters, array $value) {
    $uri = $value['value'];

    // Parameters are a list of dictionaries with "name" and "values" keys,
    // not a map keyed by parameter name, so they must be searched by name
    // rather than indexed directly.
    $partstat = $this->getScalarParameterValue($parameters, 'PARTSTAT');

    switch ($partstat) {
      case 'ACCEPTED':
        $status = PhutilCalendarUserNode::STATUS_ACCEPTED;
        break;
      case 'DECLINED':
        $status = PhutilCalendarUserNode::STATUS_DECLINED;
        break;
      case 'NEEDS-ACTION':
      default:
        $status = PhutilCalendarUserNode::STATUS_INVITED;
        break;
    }

    $name = $this->getScalarParameterValue($parameters, 'CN');

    return id(new PhutilCalendarUserNode())
      ->setURI($uri)
      ->setName($name)
      ->setStatus($status);
  }

  /**
   * Build a datetime object from a property with exactly one value.
   *
   * A value ending in "Z" is interpreted as UTC; if a "TZID" parameter is
   * also present it is ignored and a warning is raised. Otherwise, a "TZID"
   * parameter is resolved with guessTimezone().
   *
   * @param list<map<string, wild>> Parsed parameters.
   * @param map<string, wild> Decoded value from unescapeFieldValue().
   * @return PhutilCalendarAbsoluteDateTime Parsed datetime.
   */
  private function newDateTimeFromProperty(array $parameters, array $value) {
    $value = $value['value'];

    if (!$value) {
      $this->raiseParseFailure(
        self::PARSE_EMPTY_DATETIME,
        pht(
          'Expected DATE-TIME to have exactly one value, found none.'));

    }

    // A "VALUE" parameter can select a type which decodes to a scalar
    // instead of a list. Treat a scalar as a single value so count() and
    // head() below always operate on an array.
    $value = (array)$value;

    if (count($value) > 1) {
      $this->raiseParseFailure(
        self::PARSE_MANY_DATETIME,
        pht(
          'Expected DATE-TIME to have exactly one value, found more than '.
          'one.'));
    }

    $value = head($value);
    $tzid = $this->getScalarParameterValue($parameters, 'TZID');

    if (preg_match('/Z\z/', $value)) {
      if ($tzid) {
        $this->raiseWarning(
          self::WARN_TZID_UTC,
          pht(
            'DATE-TIME "%s" uses "Z" to specify UTC, but also has a TZID '.
            'parameter with value "%s". This violates RFC5545. The TZID '.
            'will be ignored, and the value will be interpreted as UTC.',
            $value,
            $tzid));
      }
      $tzid = 'UTC';
    } else if ($tzid !== null) {
      $tzid = $this->guessTimezone($tzid);
    }

    try {
      $datetime = PhutilCalendarAbsoluteDateTime::newFromISO8601(
        $value,
        $tzid);
    } catch (Exception $ex) {
      $this->raiseParseFailure(
        self::PARSE_BAD_DATETIME,
        pht(
          'Error parsing DATE-TIME: %s',
          $ex->getMessage()));
    }

    return $datetime;
  }

  /**
   * Build a duration object from a property with exactly one value.
   *
   * @param list<map<string, wild>> Parsed parameters (unused).
   * @param map<string, wild> Decoded value from unescapeFieldValue().
   * @return PhutilCalendarDuration Parsed duration.
   */
  private function newDurationFromProperty(array $parameters, array $value) {
    $value = $value['value'];

    if (!$value) {
      $this->raiseParseFailure(
        self::PARSE_EMPTY_DURATION,
        pht(
          'Expected DURATION to have exactly one value, found none.'));

    }

    // See newDateTimeFromProperty(): a "VALUE" override may produce a
    // scalar, so normalize to a list before counting.
    $value = (array)$value;

    if (count($value) > 1) {
      $this->raiseParseFailure(
        self::PARSE_MANY_DURATION,
        pht(
          'Expected DURATION to have exactly one value, found more than '.
          'one.'));
    }

    $value = head($value);

    try {
      $duration = PhutilCalendarDuration::newFromISO8601($value);
    } catch (Exception $ex) {
      $this->raiseParseFailure(
        self::PARSE_BAD_DURATION,
        pht(
          'Invalid DURATION: %s',
          $ex->getMessage()));
    }

    return $duration;
  }

  /**
   * Build a recurrence rule object from an "RRULE" property.
   *
   * @param list<map<string, wild>> Parsed parameters (unused).
   * @param map<string, wild> Decoded value from unescapeFieldValue().
   * @return PhutilCalendarRecurrenceRule Parsed rule.
   */
  private function newRecurrenceRuleFromProperty(array $parameters, $value) {
    return PhutilCalendarRecurrenceRule::newFromRRULE($value['value']);
  }

  /**
   * Look up a parameter which is expected to have at most one value.
   *
   * If the parameter appears more than once, the last occurrence is used.
   *
   * @param list<map<string, wild>> Parsed parameters.
   * @param string Parameter name to look up.
   * @param wild Value to return if the parameter is absent.
   * @return wild The parameter's value, or the default.
   */
  private function getScalarParameterValue(
    array $parameters,
    $name,
    $default = null) {

    $match = null;
    foreach ($parameters as $parameter) {
      if ($parameter['name'] == $name) {
        $match = $parameter;
      }
    }

    if ($match === null) {
      return $default;
    }

    $value = $match['values'];
    if (!$value) {
      // Parameter is specified, but with no value, like "KEY=". Just return
      // the default, as though the parameter was not specified.

      // NOTE(review): parseICSProperty() always records at least one value
      // entry per parameter (an empty string for "KEY="), so this branch
      // looks unreachable for parameters produced by this parser and "KEY="
      // currently yields an empty string rather than the default -- confirm
      // which behavior is intended.
      return $default;
    }

    if (count($value) > 1) {
      $this->raiseParseFailure(
        self::PARSE_MULTIPLE_PARAMETERS,
        pht(
          'Expected parameter "%s" to have at most one value, but found '.
          'more than one.',
          $name));
    }

    return idx(head($value), 'value');
  }

  /**
   * Resolve a "TZID" parameter value to a timezone identifier PHP knows.
   *
   * Tries, in order: an exact match against PHP's timezone identifiers, a
   * table of aliases (including Windows timezone names loaded from disk),
   * and a "UTC+3" / "GMT -05.00" style offset. If nothing matches, falls
   * back to "UTC". Guesses and fallbacks raise warnings.
   *
   * @param string Raw "TZID" value.
   * @return string Timezone identifier.
   */
  private function guessTimezone($tzid) {
    $map = DateTimeZone::listIdentifiers();
    $map = array_fuse($map);
    if (isset($map[$tzid])) {
      // This is a real timezone we recognize, so just use it as provided.
      return $tzid;
    }

    // These are alternate names for timezones. The table is built once and
    // cached for the lifetime of the process.
    static $aliases;

    if ($aliases === null) {
      $aliases = array(
        'Etc/GMT' => 'UTC',
      );

      // Load the map of Windows timezones.
      $root_path = dirname(phutil_get_library_root('phabricator'));
      $windows_path = $root_path.'/resources/timezones/windows-timezones.json';
      $windows_data = Filesystem::readFile($windows_path);
      $windows_zones = phutil_json_decode($windows_data);

      $aliases = $aliases + $windows_zones;
    }

    if (isset($aliases[$tzid])) {
      return $aliases[$tzid];
    }

    // Look for something that looks like "UTC+3" or "GMT -05.00". If we find
    // anything, pick a timezone with that offset.
    $offset_pattern =
      '/'.
      '(?:UTC|GMT)'.
      '\s*'.
      '(?P<sign>[+-])'.
      '\s*'.
      '(?P<h>\d+)'.
      '(?:'.
      '[:.](?P<m>\d+)'.
      ')?'.
      '/i';

    $matches = null;
    if (preg_match($offset_pattern, $tzid, $matches)) {
      $hours = (int)$matches['h'];
      $minutes = (int)idx($matches, 'm');
      $offset = ($hours * 60 * 60) + ($minutes * 60);

      if (idx($matches, 'sign') == '-') {
        $offset = -$offset;
      }

      // NOTE: We could possibly do better than this, by using the event start
      // time to guess a timezone. However, that won't work for recurring
      // events and would require us to do this work after finishing initial
      // parsing. Since these unusual offset-based timezones appear to be rare,
      // the benefit may not be worth the complexity.
      $now = new DateTime('@'.time());

      // Pick the first known timezone whose offset right now matches.
      foreach ($map as $identifier) {
        $zone = new DateTimeZone($identifier);
        if ($zone->getOffset($now) == $offset) {
          $this->raiseWarning(
            self::WARN_TZID_GUESS,
            pht(
              'TZID "%s" is unknown, guessing "%s" based on pattern "%s".',
              $tzid,
              $identifier,
              $matches[0]));
          return $identifier;
        }
      }
    }

    $this->raiseWarning(
      self::WARN_TZID_IGNORED,
      pht(
        'TZID "%s" is unknown, using UTC instead.',
        $tzid));

    return 'UTC';
  }

}