org-protocol-capture-html

org-protocol is awesome, but browsers do a pretty poor job of turning a page’s HTML content into plain-text. However, Pandoc supports converting from HTML to org-mode, so we can use it to turn HTML into Org-mode content! It can even turn HTML tables into Org tables!

Screenshot

Here’s an example of what you get in Emacs from capturing this page:

org-protocol-capture-html
Screenshot
Requirements
Bookmarklet
- HTML-grabbing function
- Bookmarklet
Emacs
Shell script
To-Do
- Handle long chunks of HTML

Requirements

org-protocol: This is what connects org-mode to the “outside world” using a MIME protocol handler. The instructions on the org-protocol page are a bit out of date, so you might want to try these instructions instead.
Pandoc: I’m currently using Pandoc from Ubuntu Trusty, at version 1.12.2.1, and it is able to convert from HTML to org.

Bookmarklet

HTML-grabbing function

This function gets the HTML from the browser’s selection. It’s from this answer on StackOverflow.

function () {
    var html = "";

    if (typeof content.document.getSelection != "undefined") {
        var sel = content.document.getSelection();
        if (sel.rangeCount) {
            var container = document.createElement("div");
            for (var i = 0, len = sel.rangeCount; i < len; ++i) {
                container.appendChild(sel.getRangeAt(i).cloneContents());
            }
            html = container.innerHTML;
        }
    } else if (typeof document.selection != "undefined") {
        if (document.selection.type == "Text") {
            html = document.selection.createRange().htmlText;
        }
    }

    var relToAbs = function (href) {
        var a = content.document.createElement("a");
        a.href = href;
        var abs = a.protocol + "//" + a.host + a.pathname + a.search + a.hash;
        a.remove();
        return abs;
    };
    var elementTypes = [
        ['a', 'href'],
        ['img', 'src']
    ];

    var div = content.document.createElement('div');
    div.innerHTML = html;

    elementTypes.map(function(elementType) {
        var elements = div.getElementsByTagName(elementType[0]);
        for (var i = 0; i < elements.length; i++) {
            elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));
        }
    });
    return div.innerHTML;
}

Here’s a one-line version of it, better for pasting into bookmarklets and such:

function () {var html = ""; if (typeof content.document.getSelection != "undefined") {var sel = content.document.getSelection(); if (sel.rangeCount) {var container = document.createElement("div"); for (var i = 0, len = sel.rangeCount; i < len; ++i) {container.appendChild(sel.getRangeAt(i).cloneContents());} html = container.innerHTML;}} else if (typeof document.selection != "undefined") {if (document.selection.type == "Text") {html = document.selection.createRange().htmlText;}} var relToAbs = function (href) {var a = content.document.createElement("a"); a.href = href; var abs = a.protocol + "//" + a.host + a.pathname + a.search + a.hash; a.remove(); return abs;}; var elementTypes = [['a', 'href'], ['img', 'src']]; var div = content.document.createElement('div'); div.innerHTML = html; elementTypes.map(function(elementType) {var elements = div.getElementsByTagName(elementType[0]); for (var i = 0; i < elements.length; i++) {elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));}}); return div.innerHTML;}

Bookmarklet

That function goes in the bookmarklet, and the org-protocol sub-protocol is changed to capture-html:, resulting in this:

content.location.href = 'org-protocol://capture-html://w/' + encodeURIComponent(content.location.href) + '/' + encodeURIComponent(content.document.title) + '/' + encodeURIComponent(function () {var html = ""; if (typeof content.document.getSelection !== "undefined") {var sel = content.document.getSelection(); if (sel.rangeCount) {var container = content.document.createElement("div"); for (var i = 0, len = sel.rangeCount; i < len; ++i) {container.appendChild(sel.getRangeAt(i).cloneContents());} html = container.innerHTML;}} else if (typeof document.selection !== "undefined") {if (document.selection.type === "Text") {html = document.selection.createRange().htmlText;}} var relToAbs = function (href) {var a = content.document.createElement("a"); a.href = href; return a.href;}; var elementTypes = [['a', 'href'], ['img', 'src']]; var div = content.document.createElement('div'); div.innerHTML = html; elementTypes.forEach(function(elementType) {var elements = div.getElementsByTagName(elementType[0]); for (var i = 0; i < elements.length; i++) {if (elements[i].hasAttribute(elementType[1])) {elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));}}}); return div.innerHTML;}());

Note: I use the Pentadactyl extension, so I had to use content.location.href instead of location.href, content.document instead of document, and content.document.getSelection() instead of window.getSelection(). This might work in plain Firefox too, or you might need to adjust it.

Emacs

Put org-protocol-capture-html.el in your load-path and add to your init file:

;; Load org-protocol-capture-html (the file must be on `load-path').
(require 'org-protocol-capture-html)

Shell script

The shell script is handy for piping any HTML (or plain-text) content to Org through the shell, but it’s not required.

To-Do

Handle long chunks of HTML

If you try to capture too long a chunk of HTML, it will fail with “argument list too long” errors from emacsclient. Working around this will require capturing via STDIN instead of arguments. Since org-protocol is based on using URLs, this will probably require using a shell script and a new Emacs function, and perhaps another MIME protocol-handler. Even then, it might still run into problems, because the data is passed to the shell script as an argument in the protocol-handler. Working around that would probably require a non-protocol-handler-based method using a browser extension to send the HTML directly via STDIN. This might be possible with Pentadactyl instead of making an entirely new browser extension. Also, maybe the Org-mode Capture Firefox extension could be extended (…) to do this.

nielius / org-protocol-capture-html Goto Github PK