Backed out changeset c7440c022f74 (bug 1158228) for bc1 orange

2024-09-13 09:24:08 -07:00 · 2015-05-01 16:44:52 -07:00 · 2015-05-01 16:44:52 -07:00 · c6978c74f7
commit c6978c74f7
parent 99fc155f8d
1 changed files with 49 additions and 76 deletions
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@ -108,9 +108,6 @@ Readability.prototype = {
  // it quits and just show a link.
  DEFAULT_MAX_PAGES: 5,

-  // Element tags to score by default.
-  DEFAULT_TAGS_TO_SCORE: ["SECTION", "P", "TD", "PRE"],
-
  // All of the regular expressions in use within readability.
  // Defined up here so we don't instantiate them repeatedly in loops.
  REGEXPS: {
@ -122,7 +119,7 @@ Readability.prototype = {
    byline: /byline|author|dateline|writtenby/i,
    replaceFonts: /<(\/?)font[^>]*>/gi,
    normalize: /\s{2,}/g,
-    videos: /https?:\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
+    videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i,
    nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
    prevLink: /(prev|earl|old|new|<|«)/i,
    whitespace: /^\s*$/,
@ -189,15 +186,6 @@ Readability.prototype = {
    return Array.prototype.concat.apply([], nodeLists);
  },

-  _getAllNodesWithTag: function(node, tagNames) {
-    if (node.querySelectorAll) {
-      return node.querySelectorAll(tagNames.join(','));
-    }
-    return [].concat.apply([], tagNames.map(function(tag) {
-      return node.getElementsByTagName(tag);
-    }));
-  },
-
  /**
   * Converts each <a> and <img> uri in the given element to an absolute URI.
   *
@ -598,18 +586,6 @@ Readability.prototype = {
    return false;
  },

-  _getNodeAncestors: function(node, maxDepth) {
-    maxDepth = maxDepth || 0;
-    var i = 0, ancestors = [];
-    while (node.parentNode) {
-      ancestors.push(node.parentNode)
-      if (maxDepth && ++i === maxDepth)
-        break;
-      node = node.parentNode;
-    }
-    return ancestors;
-  },
-
  /***
   * grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
   *         most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
@ -664,9 +640,8 @@ Readability.prototype = {
          }
        }

-        if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
+        if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE")
          elementsToScore.push(node);
-        }

        // Turn all divs that don't have children block level elements into p's
        if (node.tagName === "DIV") {
@ -705,18 +680,30 @@ Readability.prototype = {
      **/
      var candidates = [];
      this._forEachNode(elementsToScore, function(elementToScore) {
-        if (!elementToScore.parentNode || typeof(elementToScore.parentNode.tagName) === 'undefined')
+        var parentNode = elementToScore.parentNode;
+        var grandParentNode = parentNode ? parentNode.parentNode : null;
+        var innerText = this._getInnerText(elementToScore);
+
+        if (!parentNode || typeof(parentNode.tagName) === 'undefined')
          return;

        // If this paragraph is less than 25 characters, don't even count it.
-        var innerText = this._getInnerText(elementToScore);
        if (innerText.length < 25)
          return;

-        // Exclude nodes with no ancestor.
-        var ancestors = this._getNodeAncestors(elementToScore, 3);
-        if (ancestors.length === 0)
-          return;
+        // Initialize readability data for the parent.
+        if (typeof parentNode.readability === 'undefined') {
+          this._initializeNode(parentNode);
+          candidates.push(parentNode);
+        }
+
+        // Initialize readability data for the grandparent.
+        if (grandParentNode &&
+          typeof(grandParentNode.readability) === 'undefined' &&
+          typeof(grandParentNode.tagName) !== 'undefined') {
+          this._initializeNode(grandParentNode);
+          candidates.push(grandParentNode);
+        }

        var contentScore = 0;

@ -729,18 +716,11 @@ Readability.prototype = {
        // For every 100 characters in this paragraph, add another point. Up to 3 points.
        contentScore += Math.min(Math.floor(innerText.length / 100), 3);

-        // Initialize and score ancestors.
-        this._forEachNode(ancestors, function(ancestor, level) {
-          if (!ancestor.tagName)
-            return;
+        // Add the score to the parent. The grandparent gets half.
+        parentNode.readability.contentScore += contentScore;

-          if (typeof(ancestor.readability) === 'undefined') {
-            this._initializeNode(ancestor);
-            candidates.push(ancestor);
-          }
-
-          ancestor.readability.contentScore += contentScore / (level === 0 ? 1 : level * 2);
-        });
+        if (grandParentNode)
+          grandParentNode.readability.contentScore += contentScore / 2;
      });

      // After we've calculated scores, loop through all of the possible
@ -868,6 +848,10 @@ Readability.prototype = {
            sibling = this._setNodeTag(sibling, "DIV");
          }

+          // To ensure a node does not interfere with readability styles,
+          // remove its classnames.
+          sibling.removeAttribute("class");
+
          articleContent.appendChild(sibling);
          // siblings is a reference to the children array, and
          // sibling is removed from the array when we call appendChild().
@ -969,7 +953,7 @@ Readability.prototype = {
      var elementName = element.getAttribute("name");
      var elementProperty = element.getAttribute("property");

-      if ([elementName, elementProperty].indexOf("author") !== -1) {
+      if (elementName === "author") {
        metadata.byline = element.getAttribute("content");
        return;
      }
@ -1613,7 +1597,6 @@ Readability.prototype = {

    var tagsList = e.getElementsByTagName(tag);
    var curTagsLength = tagsList.length;
-    var isList = tag === "ul" || tag === "ol";

    // Gather counts for other typical elements embedded within.
    // Traverse backwards so we can remove nodes at the same time
@ -1649,13 +1632,13 @@ Readability.prototype = {
        var toRemove = false;
        if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
          toRemove = true;
-        } else if (!isList && li > p) {
+        } else if (li > p && tag !== "ul" && tag !== "ol") {
          toRemove = true;
-        } else if (input > Math.floor(p/3)) {
+        } else if ( input > Math.floor(p/3) ) {
          toRemove = true;
-        } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
+        } else if (contentLength < 25 && (img === 0 || img > 2) ) {
          toRemove = true;
-        } else if (!isList && weight < 25 && linkDensity > 0.2) {
+        } else if (weight < 25 && linkDensity > 0.2) {
          toRemove = true;
        } else if (weight >= 25 && linkDensity > 0.5) {
          toRemove = true;
@ -1680,7 +1663,7 @@ Readability.prototype = {
    for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
      var headers = e.getElementsByTagName('h' + headerIndex);
      for (var i = headers.length - 1; i >= 0; i -= 1) {
-        if (this._getClassWeight(headers[i]) < 0)
+        if (this._getClassWeight(headers[i]) < 0 || this._getLinkDensity(headers[i]) > 0.33)
          headers[i].parentNode.removeChild(headers[i]);
      }
    }
@ -1703,42 +1686,32 @@ Readability.prototype = {
   *
   * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
   */
-  isProbablyReaderable: function(helperIsVisible) {
-    var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
+  isProbablyReaderable: function() {
+    var nodes = this._doc.getElementsByTagName("p");
+    if (nodes.length < 5) {
+      return false;
+    }

-    // FIXME we should have a fallback for helperIsVisible, but this is
-    // problematic because of jsdom's elem.style handling - see
-    // https://github.com/mozilla/readability/pull/186 for context.
-
-    var score = 0;
-    // This is a little cheeky, we use the accumulator 'score' to decide what to return from
-    // this callback:
-    return this._someNode(nodes, function(node) {
-      if (helperIsVisible && !helperIsVisible(node))
-        return false;
+    var possibleParagraphs = 0;
+    for (var i = 0; i < nodes.length; i++) {
+      var node = nodes[i];
      var matchString = node.className + " " + node.id;

      if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
          !this.REGEXPS.okMaybeItsACandidate.test(matchString)) {
-        return false;
+        continue;
      }

-      if (node.matches && node.matches("li p")) {
-        return false;
+      if (node.textContent.trim().length < 100) {
+        continue;
      }

-      var textContentLength = node.textContent.trim().length;
-      if (textContentLength < 140) {
-        return false;
-      }
-
-      score += Math.sqrt(textContentLength - 140);
-
-      if (score > 20) {
+      possibleParagraphs++;
+      if (possibleParagraphs >= 5) {
        return true;
      }
-      return false;
-    });
+    }
+    return false;
  },

  /**