Bug 774914/785145 - Convert divs with only a <p> element child into plain <p> elements (r=mfinkle)

This commit is contained in:
Lucas Rocha 2012-08-25 11:27:27 +01:00
parent 19a391a60a
commit 2be189d784

View File

@ -458,18 +458,29 @@ Readability.prototype = {
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1) {
let newNode = doc.createElement('p');
newNode.innerHTML = node.innerHTML;
// Sites like http://mobile.slate.com enclose each paragraph in a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs that are, in practice, paragraphs.
let pIndex = this._getSinglePIndexInsideDiv(node);
if (node.innerHTML.search(this.REGEXPS.divToPElements) === -1 || pIndex >= 0) {
let newNode;
if (pIndex >= 0) {
newNode = node.childNodes[pIndex];
} else {
newNode = doc.createElement('p');
newNode.innerHTML = node.innerHTML;
// Manually update allElements since it is not a live NodeList
newNode._index = nodeIndex;
allElements[nodeIndex] = newNode;
nodesToScore[nodesToScore.length] = newNode;
}
node.parentNode.replaceChild(newNode, node);
// Manually update allElements since it is not a live NodeList
newNode._index = nodeIndex;
allElements[nodeIndex] = newNode;
purgeNode(node);
nodeIndex -= 1;
nodesToScore[nodesToScore.length] = node;
} else {
// EXPERIMENTAL
for (let i = 0, il = node.childNodes.length; i < il; i += 1) {
@ -709,6 +720,36 @@ Readability.prototype = {
}
},
/**
* Get child index of the only P element inside a DIV with no
* text content. Returns -1 if the DIV node contains non-empty
* text nodes or if it contains other element nodes.
*
* @param Element
**/
_getSinglePIndexInsideDiv: function(e) {
let childNodes = e.childNodes;
let pIndex = -1;
for (let i = childNodes.length; --i >= 0;) {
let node = childNodes[i];
if (node.nodeType === Node.ELEMENT_NODE) {
if (node.tagName !== "P")
return -1;
if (pIndex >= 0)
return -1;
pIndex = i;
} else if (node.nodeType == Node.TEXT_NODE && this._getInnerText(node, false)) {
return -1;
}
}
return pIndex;
},
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.