Finding the main content element on a page in JavaScript

Written by Ben Wendt

Short of going to something more complex like measuring information or doing some natural language processing, you can estimate which element on a page contains the content by comparing, for each element, how much text it contains with how much markup it contains. Here’s a JavaScript snippet that does roughly that: it picks the element with the most text among those whose text-to-markup ratio exceeds a threshold:

// not perfect obviously. Not terrible neither.

// Working state for the scan, one declaration per line.
// all: every element nested under <body>.
var all = document.querySelectorAll('body *');
// max: largest text length seen so far among accepted elements.
var max = 0;
// el: the best candidate found so far (undefined until one is accepted).
var el;
// id / tag: scratch values for the element currently being examined.
var id;
var tag;
// i / L: loop index and cached list length.
var i;
var L;

// Ids commonly given to the outermost layout element of a page. Such an
// element wraps everything, so it must not be picked as "the content".
var badIds = {
    wrapper: 1,
    container: 1,
    "wrapper-content": 1
};

// Tag names (upper-case, as reported by tagName in HTML documents) whose
// elements are never accepted as the content element.
var badTags = {
    SCRIPT: 1,
    STYLE: 1,
    HEADER: 1
};

// Minimum ratio of text length to markup length an element must exceed
// before it is considered a content candidate.
var contentPercent = 0.45;

/**
 * Ratio of an element's readable text to its serialized markup.
 *
 * The text of descendant <script> elements is part of textContent but is not
 * page content, so its length is subtracted before dividing.
 *
 * @param {Element} el - element to measure; must expose textContent,
 *     innerHTML and getElementsByTagName.
 * @returns {number} (text length - script text length) / innerHTML length,
 *     or 0 when the element has no markup at all.
 */
var contentRatio = function(el) {
    var markupLength = el.innerHTML.length;
    var scripts, totalScript, i, L;

    // An element with empty innerHTML has no text either; return 0 rather
    // than the NaN that 0 / 0 would produce.
    if (markupLength === 0) {
        return 0;
    }

    scripts = el.getElementsByTagName("script");
    totalScript = 0;
    for (i = 0, L = scripts.length; i < L; i++) {
        // Elements have no `length` property; the script source is exposed
        // through textContent.
        totalScript += scripts[i].textContent.length;
    }

    return (el.textContent.length - totalScript) / markupLength;
};

// Scan every element under <body> and keep the one with the most text whose
// text-to-markup ratio exceeds contentPercent, skipping known wrapper ids and
// excluded tags.
var hasOwn = Object.prototype.hasOwnProperty;
var candidate, textLength;

for (i = 0, L = all.length; i < L; i++) {
    candidate = all[i];
    id = candidate.getAttribute('id');
    // tagName is upper-case only for HTML elements in HTML documents; SVG and
    // XHTML elements keep their original case, so normalise before lookup.
    tag = candidate.tagName.toUpperCase();
    textLength = candidate.textContent ? candidate.textContent.length : 0;

    // Cheap rejections first; contentRatio walks the subtree and serialises
    // it, so it is only evaluated for elements that could actually win.
    if (textLength <= max) {
        continue;
    }
    // Own-property checks: a plain lookup would also find inherited members,
    // wrongly rejecting ids such as "constructor" or "toString".
    if (id !== null && hasOwn.call(badIds, id) && badIds[id]) {
        continue;
    }
    if (hasOwn.call(badTags, tag) && badTags[tag]) {
        continue;
    }
    if (contentRatio(candidate) > contentPercent) {
        max = textLength;
        el = candidate;
    }
}

// show the results.
if (el) {
    console.log(el);
    console.log(contentRatio(el));
} else {
    // Nothing cleared the threshold; el is undefined and must not be
    // dereferenced.
    console.log('No element exceeded the content ratio threshold.');
}