!import
(0 calls, 0 incl. v-uS, 0 excl. v-uS)
1 /* -*- Mode: Java; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2 * ***** BEGIN LICENSE BLOCK *****
3 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4 *
5 * The contents of this file are subject to the Mozilla Public License Version
6 * 1.1 (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 * http://www.mozilla.org/MPL/
9 *
10 * Software distributed under the License is distributed on an "AS IS" basis,
11 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12 * for the specific language governing rights and limitations under the
13 * License.
14 *
15 * The Original Code is the RSS Parsing Engine
16 *
17 * The Initial Developer of the Original Code is
18 * The Mozilla Foundation.
19 * Portions created by the Initial Developer are Copyright (C) 2004
20 * the Initial Developer. All Rights Reserved.
21 *
22 * Contributor(s):
23 *
24 * Alternatively, the contents of this file may be used under the terms of
25 * either the GNU General Public License Version 2 or later (the "GPL"), or
26 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27 * in which case the provisions of the GPL or the LGPL are applicable instead
28 * of those above. If you wish to allow use of your version of this file only
29 * under the terms of either the GPL or the LGPL, and not to allow others to
30 * use your version of this file under the terms of the MPL, indicate your
31 * decision by deleting the provisions above and replace them with the notice
32 * and other provisions required by the GPL or the LGPL. If you do not delete
33 * the provisions above, a recipient may use your version of this file under
34 * the terms of any one of the MPL, the GPL or the LGPL.
35 * ***** END LICENSE BLOCK ***** */
36
37 // The feed parser depends on FeedItems.js, Feed.js.
38
// Shared XPCOM helpers used by the feed parser below.

// RDF container utilities (service); the RSS 1.0 parser uses MakeSeq() on it
// to walk a channel's item list.
var rdfcontainer = Components.classes["@mozilla.org/rdf/container-utils;1"]
                             .getService(Components.interfaces.nsIRDFContainerUtils);

// RDF/XML parser instance; the RSS 1.0 parser feeds serialized markup to its
// parseString() method.
var rdfparser = Components.classes["@mozilla.org/rdf/xml-parser;1"]
                          .createInstance(Components.interfaces.nsIRDFXMLParser);

// DOM serializer instance; used to turn DOM nodes back into markup strings.
var serializer = Components.classes["@mozilla.org/xmlextras/xmlserializer;1"]
                           .createInstance(Components.interfaces.nsIDOMSerializer);
42
FeedParser
(0 calls, 0 incl. v-uS, 0 excl. v-uS)
/**
 * Feed parser.  Instances hold no state of their own: every method works on
 * the feed object and DOM/source passed in and returns an array of items.
 */
function FeedParser()
{}

FeedParser.prototype =
{
  /**
   * Detects the feed format and hands off to the matching parser.
   * This is currently a synchronous operation.
   *
   * @param aFeed     feed object; its onParseError(aFeed) is called on failure
   *                  and its title/description/link are filled in on success.
   * @param aSource   raw feed text (may be mis-encoded, see below).
   * @param aDOM      parsed XML document of the feed.
   * @param aBaseURI  base URI handed to the RDF parser for RSS 1.x feeds.
   * @return array of parsed items; an empty array if the input is unusable.
   */
  parseFeed: function (aFeed, aSource, aDOM, aBaseURI)
  {
    if (!aSource || !(aDOM instanceof Components.interfaces.nsIDOMXMLDocument))
    {
      aFeed.onParseError(aFeed);
      return new Array();
    }

    var root = aDOM.documentElement;

    if ((root.namespaceURI == "http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        && (root.getElementsByTagNameNS("http://purl.org/rss/1.0/", "channel")[0]))
    {
      debug(aFeed.url + " is an RSS 1.x (RDF-based) feed");
      // aSource can be misencoded (XMLHttpRequest converts to UTF-8 by default),
      // but the DOM is almost always right because it uses the hints in the XML file.
      // This is slower, but not noticeably so.  Mozilla doesn't have the
      // XMLHttpRequest.responseBody property that IE has, which provides access
      // to the unencoded response.
      var xmlString = serializer.serializeToString(root);
      return this.parseAsRSS1(aFeed, xmlString, aBaseURI);
    }
    else if (root.namespaceURI == ATOM_03_NS)
    {
      debug(aFeed.url + " is an Atom 0.3 feed");
      return this.parseAsAtom(aFeed, aDOM);
    }
    else if (root.namespaceURI == ATOM_IETF_NS)
    {
      debug(aFeed.url + " is an IETF Atom feed");
      return this.parseAsAtomIETF(aFeed, aDOM);
    }
    else if (aSource.search(/"http:\/\/my\.netscape\.com\/rdf\/simple\/0\.9\/"/) != -1)
    {
      debug(aFeed.url + " is an 0.90 feed");
      return this.parseAsRSS2(aFeed, aDOM);
    }
    // XXX Explicitly check for RSS 2.0 instead of letting it be handled by the
    // default behavior (who knows, we may change the default at some point).
    else
    {
      // We don't know what kind of feed this is; let's pretend it's RSS 0.9x
      // and hope things work out for the best.  In theory even RSS 1.0 feeds
      // could be parsed by the 0.9x parser if the RSS namespace was the default.
      debug(aFeed.url + " is of unknown format; assuming an RSS 0.9x feed");
      return this.parseAsRSS2(aFeed, aDOM);
    }
  },

  /**
   * Parses RSS 0.9x / 2.0 style feeds straight from the DOM.
   *
   * @param aFeed  feed object to update (title, description, link).
   * @param aDOM   feed document.
   * @return array of FeedItem objects; empty if there is no <channel>
   *         (after calling aFeed.onParseError) or if aFeed.parseItems is false.
   */
  parseAsRSS2: function (aFeed, aDOM)
  {
    var parsedItems = new Array();

    // Get the first channel (assuming there is only one per RSS File).
    var channel = aDOM.getElementsByTagName("channel")[0];
    if (!channel)
    {
      // Report the error but still hand back an array, like the other
      // parsers do, so callers can always iterate over the result.
      aFeed.onParseError(aFeed);
      return parsedItems;
    }

    // usually the empty string, unless this is RSS .90
    var nsURI = channel.namespaceURI || "";
    debug("channel NS: '" + nsURI +"'");

    aFeed.title = aFeed.title || getNodeValue(this.childrenByTagNameNS(channel, nsURI, "title")[0]);
    aFeed.description = getNodeValue(this.childrenByTagNameNS(channel, nsURI, "description")[0]);
    aFeed.link = getNodeValue(this.childrenByTagNameNS(channel, nsURI, "link")[0]);

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();
    // XXX use getElementsByTagNameNS for now
    // childrenByTagNameNS would be better, but RSS .90 is still with us
    var itemNodes = aDOM.getElementsByTagNameNS(nsURI, "item");

    for (var i = 0; i < itemNodes.length; i++)
    {
      var itemNode = itemNodes[i];
      var item = new FeedItem();
      item.feed = aFeed;
      item.characterSet = "UTF-8";

      var link = getNodeValue(this.childrenByTagNameNS(itemNode, nsURI, "link")[0]);
      var guidNode = this.childrenByTagNameNS(itemNode, nsURI, "guid")[0];
      var guid;
      var isPermaLink;
      if (guidNode)
      {
        guid = getNodeValue(guidNode);
        // A guid counts as a permalink unless explicitly marked otherwise.
        isPermaLink = guidNode.getAttribute('isPermaLink') != 'false';
        item.id = guid;
        item.isStoredWithId = true;
      }

      // Prefer <link>; fall back to the guid only when it is a permalink.
      item.url = link ? link : (guid && isPermaLink) ? guid : null;
      item.description = getNodeValue(this.childrenByTagNameNS(itemNode, nsURI, "description")[0]);
      item.title = getNodeValue(this.childrenByTagNameNS(itemNode, nsURI, "title")[0])
                   || (item.description ? (this.stripTags(item.description).substr(0, 150)) : null)
                   || item.title;

      item.author = getNodeValue(this.childrenByTagNameNS(itemNode, nsURI, "author")[0]
                                 || this.childrenByTagNameNS(itemNode, DC_NS, "creator")[0])
                    || aFeed.title
                    || item.author;
      item.date = getNodeValue(this.childrenByTagNameNS(itemNode, nsURI, "pubDate")[0]
                               || this.childrenByTagNameNS(itemNode, DC_NS, "date")[0])
                  || item.date;

      if (!item.id)
        item.id = item.feed.url + '#' + (item.date || item.title);

      // If the date is invalid, users will see the beginning of the epoch
      // unless we reset it here, so they'll see the current time instead.
      // This is typical aggregator behavior.
      if (item.date)
      {
        item.date = trimString(item.date);
        if (!isValidRFC822Date(item.date))
        {
          // XXX Use this on the other formats as well
          item.date = dateRescue(item.date);
        }
      }

      var content = getNodeValue(this.childrenByTagNameNS(itemNode, RSS_CONTENT_NS, "encoded")[0]);
      if (content)
        item.content = content;

      // Handle an enclosure (if present)
      var enclosureNode = this.childrenByTagNameNS(itemNode, nsURI, "enclosure")[0];
      if (enclosureNode)
        item.enclosure = new FeedEnclosure(enclosureNode.getAttribute("url"),
                                           enclosureNode.getAttribute("type"),
                                           enclosureNode.getAttribute("length"));
      parsedItems[i] = item;
    }
    return parsedItems;
  },

  /**
   * Parses RSS 1.0 (RDF) feeds by loading the source into an in-memory RDF
   * data source and querying it.
   *
   * @param aFeed     feed object to update (title, description, link).
   * @param aSource   feed markup as a string.
   * @param aBaseURI  base URI passed to the RDF parser.
   * @return array of FeedItem objects; empty if aFeed.parseItems is false.
   */
  parseAsRSS1 : function(aFeed, aSource, aBaseURI)
  {
    var parsedItems = new Array();

    // RSS 1.0 is valid RDF, so use the RDF parser/service to extract data.
    // Create a new RDF data source and parse the feed into it.
    var ds = Components.classes["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"]
                       .createInstance(Components.interfaces.nsIRDFDataSource);

    rdfparser.parseString(ds, aBaseURI, aSource);

    // Get information about the feed as a whole.
    var channel = ds.GetSource(RDF_TYPE, RSS_CHANNEL, true);

    aFeed.title = aFeed.title || getRDFTargetValue(ds, channel, RSS_TITLE) || aFeed.url;
    aFeed.description = getRDFTargetValue(ds, channel, RSS_DESCRIPTION) || "";
    aFeed.link = getRDFTargetValue(ds, channel, RSS_LINK) || aFeed.url;

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();

    var items = ds.GetTarget(channel, RSS_ITEMS, true);
    if (items)
      items = rdfcontainer.MakeSeq(ds, items).GetElements();

    // If the channel doesn't list any items, look for resources of type "item"
    // (a hacky workaround for some buggy feeds).
    if (!items || !items.hasMoreElements())
      items = ds.GetSources(RDF_TYPE, RSS_ITEM, true);

    var index = 0;
    while (items.hasMoreElements())
    {
      var itemResource = items.getNext().QueryInterface(Components.interfaces.nsIRDFResource);
      var item = new FeedItem();
      item.feed = aFeed;
      item.characterSet = "UTF-8";

      // Prefer the value of the link tag to the item URI since the URI could be
      // a relative URN.
      var uri = itemResource.Value;
      var link = getRDFTargetValue(ds, itemResource, RSS_LINK);

      // XXX
      // check for bug258465 -- entities appear escaped
      // in the value returned by getRDFTargetValue when they shouldn't
      //debug("link comparison\n" + " uri: " + uri + "\nlink: " + link);

      item.url = link || uri;
      item.id = item.url;
      item.description = getRDFTargetValue(ds, itemResource, RSS_DESCRIPTION);
      item.title = getRDFTargetValue(ds, itemResource, RSS_TITLE)
                   || getRDFTargetValue(ds, itemResource, DC_SUBJECT)
                   || (item.description ? (this.stripTags(item.description).substr(0, 150)) : null)
                   || item.title;
      item.author = getRDFTargetValue(ds, itemResource, DC_CREATOR)
                    || getRDFTargetValue(ds, channel, DC_CREATOR)
                    || aFeed.title
                    || item.author;

      item.date = getRDFTargetValue(ds, itemResource, DC_DATE) || item.date;
      item.content = getRDFTargetValue(ds, itemResource, RSS_CONTENT_ENCODED);

      parsedItems[index++] = item;
    }

    return parsedItems;
  },

  /**
   * Parses Atom 0.3 feeds from the DOM.
   *
   * @param aFeed  feed object to update (title, description, link).
   * @param aDOM   feed document.
   * @return array of FeedItem objects; empty if there is no <feed> element
   *         (after calling aFeed.onParseError) or if aFeed.parseItems is false.
   */
  parseAsAtom: function(aFeed, aDOM)
  {
    var parsedItems = new Array();

    // Get the first channel (assuming there is only one per Atom File).
    var channel = aDOM.getElementsByTagName("feed")[0];
    if (!channel)
    {
      aFeed.onParseError(aFeed);
      return parsedItems;
    }

    aFeed.title = aFeed.title || this.stripTags(getNodeValue(this.childrenByTagNameNS(channel, ATOM_03_NS, "title")[0]));
    aFeed.description = getNodeValue(this.childrenByTagNameNS(channel, ATOM_03_NS, "tagline")[0]);
    aFeed.link = this.findAtomLink("alternate", this.childrenByTagNameNS(channel, ATOM_03_NS, "link"));

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();
    var items = this.childrenByTagNameNS(channel, ATOM_03_NS, "entry");
    debug("Items to parse: " + items.length);

    for (var i = 0; i < items.length; i++)
    {
      var itemNode = items[i];
      var item = new FeedItem();
      item.feed = aFeed;
      item.characterSet = "UTF-8";

      item.url = this.findAtomLink("alternate", this.childrenByTagNameNS(itemNode, ATOM_03_NS, "link"));
      item.id = getNodeValue(this.childrenByTagNameNS(itemNode, ATOM_03_NS, "id")[0]);
      item.description = getNodeValue(this.childrenByTagNameNS(itemNode, ATOM_03_NS, "summary")[0]);
      item.title = getNodeValue(this.childrenByTagNameNS(itemNode, ATOM_03_NS, "title")[0])
                   || (item.description ? item.description.substr(0, 150) : null)
                   || item.title;

      // Author lookup order: entry author, entry contributor, feed author.
      var authorEl = this.childrenByTagNameNS(itemNode, ATOM_03_NS, "author")[0]
                     || this.childrenByTagNameNS(itemNode, ATOM_03_NS, "contributor")[0]
                     || this.childrenByTagNameNS(channel, ATOM_03_NS, "author")[0];
      var author = "";

      if (authorEl)
      {
        var name = getNodeValue(this.childrenByTagNameNS(authorEl, ATOM_03_NS, "name")[0]);
        var email = getNodeValue(this.childrenByTagNameNS(authorEl, ATOM_03_NS, "email")[0]);
        if (name)
          author = name + (email ? " <" + email + ">" : "");
        else if (email)
          author = email;
      }

      item.author = author || item.author || aFeed.title;

      item.date = getNodeValue(this.childrenByTagNameNS(itemNode, ATOM_03_NS, "modified")[0]
                               || this.childrenByTagNameNS(itemNode, ATOM_03_NS, "issued")[0]
                               || this.childrenByTagNameNS(itemNode, ATOM_03_NS, "created")[0])
                  || item.date;

      // XXX We should get the xml:base attribute from the content tag as well
      // and use it as the base HREF of the message.
      // XXX Atom feeds can have multiple content elements; we should differentiate
      // between them and pick the best one.
      // Some Atom feeds wrap the content in a CTYPE declaration; others use
      // a namespace to identify the tags as HTML; and a few are buggy and put
      // HTML tags in without declaring their namespace so they look like Atom.
      // We deal with the first two but not the third.

      var content;
      var contentNode = this.childrenByTagNameNS(itemNode, ATOM_03_NS, "content")[0];
      if (contentNode)
      {
        content = "";
        for (var j = 0; j < contentNode.childNodes.length; j++)
        {
          var node = contentNode.childNodes.item(j);
          if (node.nodeType == node.CDATA_SECTION_NODE)
            content += node.data;
          else
            content += serializer.serializeToString(node);
        }

        // "escaped" content carries markup as entity-encoded text; decode it
        // so the stored content is real markup.
        if (contentNode.getAttribute('mode') == "escaped")
          content = this.xmlUnescape(content);

        if (content == "")
          content = null;
      }

      item.content = content;
      parsedItems[i] = item;
    }
    return parsedItems;
  },

  /**
   * Parses IETF Atom (ATOM_IETF_NS) feeds from the DOM.
   *
   * @param aFeed  feed object to update (title, description, link).
   * @param aDOM   feed document.
   * @return array of FeedItem objects; empty if there is no <feed> element
   *         (after calling aFeed.onParseError) or if aFeed.parseItems is false.
   */
  parseAsAtomIETF: function(aFeed, aDOM)
  {
    var parsedItems = new Array();

    // Get the first channel (assuming there is only one per Atom File).
    var channel = this.childrenByTagNameNS(aDOM, ATOM_IETF_NS, "feed")[0];
    if (!channel)
    {
      aFeed.onParseError(aFeed);
      return parsedItems;
    }

    aFeed.title = aFeed.title || this.stripTags(this.serializeTextConstruct(this.childrenByTagNameNS(channel, ATOM_IETF_NS, "title")[0]));
    aFeed.description = this.serializeTextConstruct(this.childrenByTagNameNS(channel, ATOM_IETF_NS, "subtitle")[0]);
    aFeed.link = this.findAtomLink("alternate", this.childrenByTagNameNS(channel, ATOM_IETF_NS, "link"));

    if (!aFeed.parseItems)
      return parsedItems;

    aFeed.invalidateItems();
    var items = this.childrenByTagNameNS(channel, ATOM_IETF_NS, "entry");
    debug("Items to parse: " + items.length);

    for (var i = 0; i < items.length; i++)
    {
      var itemNode = items[i];
      var summaryNode = this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "summary")[0];
      var contentNode = this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "content")[0];

      var item = new FeedItem();
      item.feed = aFeed;
      item.characterSet = "UTF-8";
      item.isStoredWithId = true;
      item.url = this.findAtomLink("alternate", this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "link")) || aFeed.link;
      item.id = getNodeValue(this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "id")[0]);
      item.description = this.serializeTextConstruct(summaryNode);
      item.title = this.stripTags(this.serializeTextConstruct(this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "title")[0])
                                  || (item.description ? item.description.substr(0, 150) : null)
                                  || item.title);

      // XXX Support multiple authors
      // Author lookup order: entry author, entry <source> author, feed author.
      var source = this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "source")[0];
      var authorEl = this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "author")[0]
                     || (source ? this.childrenByTagNameNS(source, ATOM_IETF_NS, "author")[0] : null)
                     || this.childrenByTagNameNS(channel, ATOM_IETF_NS, "author")[0];
      var author = "";

      if (authorEl)
      {
        var name = getNodeValue(this.childrenByTagNameNS(authorEl, ATOM_IETF_NS, "name")[0]);
        var email = getNodeValue(this.childrenByTagNameNS(authorEl, ATOM_IETF_NS, "email")[0]);
        if (name)
          author = name + (email ? " <" + email + ">" : "");
        else if (email)
          author = email;
      }

      item.author = author || item.author || aFeed.title;
      item.date = getNodeValue(this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "updated")[0]
                               || this.childrenByTagNameNS(itemNode, ATOM_IETF_NS, "published")[0])
                  || item.date;

      item.content = this.serializeTextConstruct(contentNode);

      // Record the base URI of whichever element supplied the displayed text.
      // A non-null content/description implies the matching node exists.
      if (item.content)
        item.xmlContentBase = contentNode.baseURI;
      else if (item.description)
        item.xmlContentBase = summaryNode.baseURI;
      else
        item.xmlContentBase = itemNode.baseURI;

      parsedItems[i] = item;
    }

    return parsedItems;
  },

  /**
   * Serializes an Atom text construct (title, summary, content, ...) to a
   * markup string.
   *
   * CDATA children are XML-escaped and other children are run through the
   * DOM serializer; for type "html" the result is then unescaped once so the
   * embedded HTML becomes real markup.
   *
   * @param textElement  element to serialize; may be null/undefined.
   * @return the serialized string, or null when the element is missing, has
   *         an unsupported type, or yields no content.
   */
  serializeTextConstruct: function(textElement)
  {
    var content = "";

    if (textElement)
    {
      var textType = textElement.getAttribute('type');

      // Atom spec says consider it "text" if not present
      if (!textType)
        textType = "text";

      // There could be some strange content type we don't handle
      if ((textType != "text") && (textType != "html") && (textType != "xhtml"))
        return null;

      for (var j = 0; j < textElement.childNodes.length; j++)
      {
        var node = textElement.childNodes.item(j);
        if (node.nodeType == node.CDATA_SECTION_NODE)
          content += this.xmlEscape(node.data);
        else
          content += serializer.serializeToString(node);
      }
      if (textType == "html")
        content = this.xmlUnescape(content);
    }

    // other parts of the code depend on this being null
    // if there's no content
    return content ? content : null;
  },

  /**
   * Finds elements with the given namespace and tag name that are direct
   * children of aElement (getElementsByTagNameNS alone returns all
   * descendants).
   *
   * @return a plain array of the matching child elements, possibly empty.
   */
  childrenByTagNameNS: function(aElement, aNamespace, aTagName)
  {
    var matches = aElement.getElementsByTagNameNS(aNamespace, aTagName);
    var matchingChildren = new Array();
    for (var i = 0; i < matches.length; i++)
    {
      if (matches[i].parentNode == aElement)
        matchingChildren.push(matches[i]);
    }
    return matchingChildren;
  },

  /**
   * Returns the resolved href of the first link element whose rel matches
   * linkRel.  A link without a rel attribute is treated as "alternate".
   *
   * @param linkRel       rel value to look for, e.g. "alternate".
   * @param linkElements  array of link elements to search.
   * @return the href resolved against the link's baseURI, or null if no
   *         suitable link is found.
   */
  findAtomLink: function(linkRel, linkElements)
  {
    // XXX Need to check for MIME type and hreflang
    for (var j = 0; j < linkElements.length; j++)
    {
      var alink = linkElements[j];
      if (!alink)
        continue;

      var rel = alink.getAttribute('rel');
      var relMatches = rel ? (rel == linkRel)
                           : (linkRel == "alternate"); // no rel: assume 'alternate'

      if (relMatches && alink.getAttribute('href'))
      {
        // Atom links are interpreted relative to xml:base
        var ioService = Components.classes["@mozilla.org/network/io-service;1"]
                                  .getService(Components.interfaces.nsIIOService);
        // Keep this local: assigning to an undeclared name would leak a global.
        var url = ioService.newURI(alink.baseURI, null, null);
        return url.resolve(alink.getAttribute('href'));
      }
    }
    return null;
  },

  /**
   * Removes anything that looks like a tag (<...>) from a string.
   * Falsy input (null, undefined, "") is returned unchanged.
   */
  stripTags: function(someHTML)
  {
    return someHTML ? someHTML.replace(/<[^>]+>/g, "") : someHTML;
  },

  /**
   * Decodes the &lt;, &gt; and &amp; entities in a string.
   * &amp; is handled last so "&amp;lt;" decodes to "&lt;", not "<".
   */
  xmlUnescape: function(s)
  {
    s = s.replace(/&lt;/g, "<");
    s = s.replace(/&gt;/g, ">");
    s = s.replace(/&amp;/g, "&");
    return s;
  },

  /**
   * Encodes &, > and < as XML entities.
   * & is handled first so the ampersands of the other entities are not
   * escaped a second time.
   */
  xmlEscape: function(s)
  {
    s = s.replace(/&/g, "&amp;");
    s = s.replace(/>/g, "&gt;");
    s = s.replace(/</g, "&lt;");
    return s;
  }
};