10. June 2013

Finding the main content element on a page in javascript

Short of going to something more complex like measuring information or doing some natural language processing, you can estimate which element on a page contains the content by determining which element has the highest ratio of contained content to contained markup. Here’s a javascript snippet that does just that:

// not perfect obviously. Not terrible neither.

var id, tag;
var all = document.querySelectorAll('body *'), max = 0, el, i, L;

// list some commons ids that denote the outermost element on a page.
var badIds = {
    "wrapper" : 1,
    "container" : 1,
    "wrapper-content" : 1
};

// we don't want to include content from certain tags.
var badTags = {
    "SCRIPT" : 1,
    "STYLE" : 1,
    "HEADER" : 1
}

// the goal rate of markup per content
var contentPercent = 0.45;

var contentRatio = function(el) {
    var i, L, totalScript = 0, scripts = el.getElementsByTagName("script");
    for (i =0, L= scripts.length; i < L; i++) {
        totalScript += scripts[i].length;
    }
    totalScript = 0;
    return (el.textContent.length - totalScript) / el.innerHTML.length;
};

for (i = 0, L =all.length; i < L; i++) {
    id = all[i].getAttribute('id');
    tag = all[i].tagName;
    if (all[i].textContent && all[i].textContent.length > max && (contentRatio(all[i]) > contentPercent) && !badIds[id] && !badTags[tag]) {
        max = all[i].textContent.length;
        el = all[i];
    }
}

// show the results.
console.log(el)
console.log(el.textContent.length / el.innerHTML.length)

30. May 2013

Simple Markov Chain in PHP

Here’s a simple Markov chain implementation in PHP, loosely adapted from this excellent write up about implementing Markov chains in javascript:

class Link {
    private $nexts = array();
    public function addNextWord($word) {
        if (!is_string($word)) {
            throw new Exception('addNextWord method in Link class is run with an string parameter');
        }
        if (!isset($this->nexts[$word])) {
            $this->nexts[$word] = 0;
        }
        $this->nexts[$word]++;
    }
    public function getNextWord() {
        $total = 0;
        foreach($this->nexts as $word => $count) {
            $total += $count;
        }
        $randomIndex = rand(1, $total);
        $total = 0;
        foreach($this->nexts as $word => $count) {
            $total += $count;
            if ($total >= $randomIndex) {
                return $word;
            }
        }
    }
}

class Chain {
    private $words = array();
    function __construct($words) {
        if (!is_array($words)) {
            throw new Exception('Chain class is instantiated with an array');
        }

        for($i = 0; $i < count($words); $i++) {
            $word = (string) $words[$i];
            if (!isset($this->words[$word])) {
                $this->words[$word] = new Link();
            }
            if (isset($words[$i + 1])) {
                $this->words[$word]->addNextWord($words[$i + 1]);
            }
        }
    }
    public function getChainOfLength($word, $i) {
        if (!is_string($word)) {
            throw new Exception('getChainOfLength method in Chain class is run with an string parameter');
        }
        if (!is_integer($i)) {
            throw new Exception('getChainOfLength method should be called with an integer');
        }
        if (!isset($this->words[$word])) {
            return '';
        } else {
            $chain = array($word);
            for ($j = 0; $j < $i; $j++) {
                $word = $this->words[$word]->getNextWord();
                $chain[] = $word;
            }
            return implode(' ', $chain);
        }
    }
}

And here is an example of usage:

function get_all_words_in_file($file) {
    return preg_split('/s+/ ', file_get_contents($file));
}

$file = 'testtext2.txt';

$words = get_all_words_in_file($file);
$chain = new Chain($words);
$newSentence = $chain->getChainOfLength('The', 200);
echo wordwrap($newSentence, 80, "n");

Conceptually, a Markov chain captures the idea of likelihood of traversing from state to state. You can populate this data for a block of text by passing through a block of text and counting the number of occurrences of words that follow a given word. You can then use this data to generate new blocks of text.

24. May 2013

A Mandelbrot Set Viewer via Javascript and Canvas

In mathematics, a fixed point is an input x for a function f such that

f(x) = x

The Mandelbrot Set is a visualization of which points near 0 in the complex plane are fixed points for the function f(z) = z^{2} - 1, where zinmathbb{C}. Points that are fixed are rendered in black, while all other points are colour-coded based on how quickly repeated application of the function to a point diverges.

Images of the Mandelbrot set should be familiar to most everyone. It’s an infinitely detailed, self-similar set of rainbows surrounding bubbles.

canvas6canvas

canvas2canvas3

canvas4canvas5

So here’s my javascript and canvas based Mandlebrot viewer:

var xScale, xOffext, yScale, yOffset, xVal, yVal;
var canvas, h, w;
updateScalesAndOffsets = function() {
    xScale = parseFloat(document.getElementById('xScale').value);
    xOffset = parseFloat(document.getElementById('xOffset').value);
    yScale = parseFloat(document.getElementById('yScale').value);
    yOffset = parseFloat(document.getElementById('yOffset').value);
};
updateCanvasAndDimensions = function() {
    canvas = document.getElementById('m');
    h = canvas.getAttribute('height');
    w = canvas.getAttribute('width');
};
doMandelbrot = function() {
    var iteration, max_iteration = 1000, l, x, y, x0, y0, xtemp;
    updateCanvasAndDimensions();
    var ctx = canvas.getContext('2d');
    updateScalesAndOffsets();

    for (var i=0; i < w; i++) {
        for (var j=0;j < h; j++) {
            // for each point in the image, generate the color value.
            x0 = xScale * (i / w) + xOffset;
            y0 = yScale * (j / h) + yOffset;

            x = 0;
            y = 0;

            iteration = 0;

            while (x*x + y*y < 4 && iteration < max_iteration) {
                // this is parametrically performing the complex function f(z) = z^2 -1.
                xtemp = x*x - y*y + x0;
                y = 2*x*y + y0;
                x = xtemp;
                iteration++;
            }

            if (x*x + y*y < 4) {
                ctx.fillStyle='rgb(0,0,0)';
            } else {
                l = iteration < 50? iteration : 50;
                // set colors using hsl so that the number of iterations to diverge maps to the hue.
                ctx.fillStyle='hsl('+Math.floor((iteration/max_iteration)*256)+',100%,' + l + '%)';
            }

            ctx.fillRect(i,j,i+1,j+1);
        }
    }
};
mouseMove = function(e) {


    xVal = xScale * (e.clientX / w) + xOffset;
    yVal = yScale * (e.clientY / h) + yOffset;
    var xCoordinateElement = document.getElementById('xCoordinate'), yCoordinateElement = document.getElementById('yCoordinate');
    xCoordinateElement.innerHTML = xVal;
    yCoordinateElement.innerHTML = yVal;
};

zoomIn = function() {
    document.getElementById('xScale').value = parseFloat(document.getElementById('xScale').value) / 2;
    document.getElementById('xOffset').value = xVal - parseFloat(document.getElementById('xScale').value) / 2;
    document.getElementById('yScale').value = parseFloat(document.getElementById('yScale').value) / 2;
    document.getElementById('yOffset').value = yVal - parseFloat(document.getElementById('yScale').value) / 2;
    doMandelbrot();

}
zoomOut = function() {
    document.getElementById('xScale').value = parseFloat(document.getElementById('xScale').value) * 2;
    document.getElementById('yScale').value = parseFloat(document.getElementById('yScale').value) * 2;
    doMandelbrot();

}
moveDir = function(dir) {
    switch (dir) {
        case 'up' : document.getElementById('yOffset').value = yOffset - yScale / 10; break;
        case 'down' : document.getElementById('yOffset').value = yOffset + yScale / 10; break;
        case 'right' : document.getElementById('xOffset').value = xOffset + xScale / 10; break;
        case 'left' : document.getElementById('xOffset').value = xOffset - xScale / 10; break;
    }
    updateScalesAndOffsets();
    doMandelbrot();
}