From 84878580e8504a9768eaae48ccccd288b60e666c Mon Sep 17 00:00:00 2001 From: Gijs Kruitbosch Date: Tue, 17 Mar 2015 20:58:58 -0700 Subject: [PATCH] No bug, update Readability.js to the version in github, rs=margaret,me per discussion earlier today --- toolkit/components/reader/Readability.js | 100 +++++++++++++++-------- 1 file changed, 65 insertions(+), 35 deletions(-) diff --git a/toolkit/components/reader/Readability.js b/toolkit/components/reader/Readability.js index 8fa575b8a15..2a05784de50 100644 --- a/toolkit/components/reader/Readability.js +++ b/toolkit/components/reader/Readability.js @@ -25,9 +25,9 @@ * This code is heavily based on Arc90's readability.js (1.7.1) script * available at: http://code.google.com/p/arc90labs-readability */ - +var root = this; var Readability = function(uri, doc) { - const ENABLE_LOGGING = false; + var ENABLE_LOGGING = false; this._uri = uri; this._doc = doc; @@ -53,8 +53,26 @@ var Readability = function(uri, doc) { // Control whether log messages are sent to the console if (ENABLE_LOGGING) { - this.log = function (msg) { - dump("Reader: (Readability) " + msg); + function logEl(e) { + var rv = e.nodeName + " "; + if (e.nodeType == e.TEXT_NODE) { + return rv + '("' + e.textContent + '")'; + } + var classDesc = e.className && ("." + e.className.replace(/ /g, ".")); + var elDesc = e.id ? "(#" + e.id + classDesc + ")" : + (classDesc ? "(" + classDesc + ")" : ""); + return rv + elDesc; + } + this.log = function () { + if ("dump" in root) { + var msg = Array.prototype.map.call(arguments, function(x) { + return (x && x.nodeName) ? logEl(x) : x; + }).join(" "); + dump("Reader: (Readability) " + msg + "\n"); + } else if ("console" in root) { + var args = ["Reader: (Readability) "].concat(arguments); + console.log.apply(console, args); + } }; } else { this.log = function () {}; @@ -203,27 +221,15 @@ Readability.prototype = { _prepDocument: function() { var doc = this._doc; - // In some cases a body element can't be found (if the HTML is - // totally hosed for example) so we create a new body node and - // append it to the document. - if (!doc.body) { - var body = doc.createElement("body"); - - try { - doc.body = body; - } catch(e) { - doc.documentElement.appendChild(body); - this.log(e); - } - } - // Remove all style tags in head var styleTags = doc.getElementsByTagName("style"); for (var st = 0; st < styleTags.length; st += 1) { styleTags[st].textContent = ""; } - this._replaceBrs(doc.body); + if (doc.body) { + this._replaceBrs(doc.body); + } var fonts = doc.getElementsByTagName("FONT"); for (var i = fonts.length; --i >=0;) { @@ -412,6 +418,13 @@ Readability.prototype = { var doc = this._doc; var isPaging = (page !== null ? true: false); page = page ? page : this._doc.body; + + // We can't grab an article if we don't have a page! + if (!page) { + this.log("No body found in document. Abort."); + return null; + } + var pageCacheHtml = page.innerHTML; // Check if any "dir" is set on the toplevel document element @@ -576,8 +589,7 @@ Readability.prototype = { var candidateScore = candidate.readability.contentScore * (1 - this._getLinkDensity(candidate)); candidate.readability.contentScore = candidateScore; - this.log('Candidate: ' + candidate + " (" + candidate.className + ":" + - candidate.id + ") with score " + candidateScore); + this.log('Candidate:', candidate, "with score " + candidateScore); for (var t = 0; t < this.N_TOP_CANDIDATES; t++) { var aTopCandidate = topCandidates[t]; @@ -592,15 +604,18 @@ Readability.prototype = { } var topCandidate = topCandidates[0] || null; + var neededToCreateTopCandidate = false; // If we still have no top candidate, just use the body as a last resort. // We also have to copy the body node so it is something we can modify. if (topCandidate === null || topCandidate.tagName === "BODY") { // Move all of the page's children into topCandidate topCandidate = doc.createElement("DIV"); + neededToCreateTopCandidate = true; var children = page.childNodes; - for (var i = 0; i < children.length; ++i) { - topCandidate.appendChild(children[i]); + while (children.length) { + this.log("Moving child out:", children[0]); + topCandidate.appendChild(children[0]); } page.appendChild(topCandidate); @@ -622,7 +637,7 @@ Readability.prototype = { var siblingNode = siblingNodes[s]; var append = false; - this.log("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : '')); + this.log("Looking at sibling node:", siblingNode, ((typeof siblingNode.readability !== 'undefined') ? ("with score " + siblingNode.readability.contentScore) : '')); this.log("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown')); if (siblingNode === topCandidate) @@ -651,7 +666,7 @@ Readability.prototype = { } if (append) { - this.log("Appending node: " + siblingNode); + this.log("Appending node:", siblingNode); // siblingNodes is a reference to the childNodes array, and // siblingNode is removed from the array when we call appendChild() @@ -663,14 +678,14 @@ Readability.prototype = { if (siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") { // We have a node that isn't a common block level element, like a form or td tag. // Turn it into a div so it doesn't get filtered out later by accident. */ - this.log("Altering siblingNode of " + siblingNode.nodeName + ' to div.'); + this.log("Altering siblingNode:", siblingNode, 'to div.'); this._setNodeTag(siblingNode, "DIV"); } // To ensure a node does not interfere with readability styles, // remove its classnames. - siblingNode.className = ""; + siblingNode.removeAttribute("class"); // Append sibling and subtract from our list because it removes // the node when you append to another node. @@ -678,20 +693,33 @@ Readability.prototype = { } } + this.log("Article content pre-prep: " + articleContent.innerHTML); // So we have all of the content that we need. Now we clean it up for presentation. this._prepArticle(articleContent); + this.log("Article content post-prep: " + articleContent.innerHTML); if (this._curPageNum === 1) { - var div = doc.createElement("DIV"); - div.id = "readability-page-1"; - div.className = "page"; - var children = articleContent.childNodes; - for (var i = 0; i < children.length; ++i) { - div.appendChild(children[i]); + if (neededToCreateTopCandidate) { + // We already created a fake div thing, and there wouldn't have been any siblings left + // for the previous loop, so there's no point trying to create a new div, and then + // move all the children over. Just assign IDs and class names here. No need to append + // because that already happened anyway. + topCandidate.id = "readability-page-1"; + topCandidate.className = "page"; + } else { + var div = doc.createElement("DIV"); + div.id = "readability-page-1"; + div.className = "page"; + var children = articleContent.childNodes; + while (children.length) { + div.appendChild(children[0]); + } + articleContent.appendChild(div); } - articleContent.appendChild(div); } + this.log("Article content after paging: " + articleContent.innerHTML); + // Now that we've gone through the full algorithm, check to see if // we got any meaningful content. If we didn't, we may need to re-run // grabArticle with different flags set. This gives us a higher likelihood of @@ -1401,7 +1429,7 @@ Readability.prototype = { var weight = this._getClassWeight(tagsList[i]); var contentScore = 0; - this.log("Cleaning Conditionally " + tagsList[i] + " (" + tagsList[i].className + ":" + tagsList[i].id + ")"); + this.log("Cleaning Conditionally", tagsList[i]); if (weight + contentScore < 0) { tagsList[i].parentNode.removeChild(tagsList[i]); @@ -1508,6 +1536,8 @@ Readability.prototype = { if (!articleContent) return null; + this.log("Grabbed: " + articleContent.innerHTML); + this._postProcessContent(articleContent); // if (nextPageLink) {