Giter Club home page Giter Club logo

org-protocol-capture-html's Introduction

org-protocol-capture-html

org-protocol is awesome, but browsers do a pretty poor job of turning a page’s HTML content into plain-text. However, Pandoc supports converting from HTML to org-mode, so we can use it to turn HTML into Org-mode content! It can even turn HTML tables into Org tables!

Screenshot

Here’s an example of what you get in Emacs from capturing this page:

screenshot.png

Contents

Requirements

  • org-protocol: This is what connects org-mode to the “outside world” using a MIME protocol handler. The instructions on the org-protocol page are a bit out of date, so you might want to try these instructions instead.
  • Pandoc: I’m currently using Pandoc from Ubuntu Trusty, at version 1.12.2.1, and it is able to convert from HTML to org.

Bookmarklet

HTML-grabbing function

This function gets the HTML from the browser’s selection. It’s from this answer on StackOverflow.

function () {
    var html = "";

    if (typeof content.document.getSelection != "undefined") {
        var sel = content.document.getSelection();
        if (sel.rangeCount) {
            var container = document.createElement("div");
            for (var i = 0, len = sel.rangeCount; i < len; ++i) {
                container.appendChild(sel.getRangeAt(i).cloneContents());
            }
            html = container.innerHTML;
        }
    } else if (typeof document.selection != "undefined") {
        if (document.selection.type == "Text") {
            html = document.selection.createRange().htmlText;
        }
    }

    var relToAbs = function (href) {
        var a = content.document.createElement("a");
        a.href = href;
        var abs = a.protocol + "//" + a.host + a.pathname + a.search + a.hash;
        a.remove();
        return abs;
    };
    var elementTypes = [
        ['a', 'href'],
        ['img', 'src']
    ];

    var div = content.document.createElement('div');
    div.innerHTML = html;

    elementTypes.map(function(elementType) {
        var elements = div.getElementsByTagName(elementType[0]);
        for (var i = 0; i < elements.length; i++) {
            elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));
        }
    });
    return div.innerHTML;
}

Here’s a one-line version of it, better for pasting into bookmarklets and such:

function () {var html = ""; if (typeof content.document.getSelection != "undefined") {var sel = content.document.getSelection(); if (sel.rangeCount) {var container = document.createElement("div"); for (var i = 0, len = sel.rangeCount; i < len; ++i) {container.appendChild(sel.getRangeAt(i).cloneContents());} html = container.innerHTML;}} else if (typeof document.selection != "undefined") {if (document.selection.type == "Text") {html = document.selection.createRange().htmlText;}} var relToAbs = function (href) {var a = content.document.createElement("a"); a.href = href; var abs = a.protocol + "//" + a.host + a.pathname + a.search + a.hash; a.remove(); return abs;}; var elementTypes = [['a', 'href'], ['img', 'src']]; var div = content.document.createElement('div'); div.innerHTML = html; elementTypes.map(function(elementType) {var elements = div.getElementsByTagName(elementType[0]); for (var i = 0; i < elements.length; i++) {elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));}}); return div.innerHTML;}

Bookmarklet

That function goes in the bookmarklet, and the org-protocol sub-protocol is changed to capture-html:, resulting in this:

/* Navigate to an org-protocol capture-html URL carrying the page URL, the page title and the selection's HTML. */ content.location.href = 'org-protocol://capture-html://w/' + encodeURIComponent(content.location.href) + '/' + encodeURIComponent(content.document.title) + '/' + encodeURIComponent(function () {var html = ""; if (typeof content.document.getSelection !== "undefined") {var sel = content.document.getSelection(); if (sel.rangeCount) {var container = content.document.createElement("div"); for (var i = 0, len = sel.rangeCount; i < len; ++i) {container.appendChild(sel.getRangeAt(i).cloneContents());} html = container.innerHTML;}} else if (typeof document.selection !== "undefined") {if (document.selection.type === "Text") {html = document.selection.createRange().htmlText;}} /* Reading a.href back gives the browser-resolved URL and keeps mailto:/data: URLs intact. */ var relToAbs = function (href) {var a = content.document.createElement("a"); a.href = href; return a.href;}; var elementTypes = [['a', 'href'], ['img', 'src']]; var div = content.document.createElement('div'); div.innerHTML = html; elementTypes.forEach(function (elementType) {var elements = div.getElementsByTagName(elementType[0]); for (var i = 0; i < elements.length; i++) {/* Skip elements lacking the attribute, e.g. <a name="x">. */ if (elements[i].hasAttribute(elementType[1])) {elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));}}}); return div.innerHTML;}());

Note: I use the Pentadactyl extension, so I had to use content.location.href instead of location.href, content.document instead of document, and content.document.getSelection() instead of window.getSelection(). This might work in plain Firefox too, or you might need to adjust it.

Emacs

Put org-protocol-capture-html.el in your load-path and add to your init file:

;; Load the `org-protocol-capture-html' feature (org-protocol-capture-html.el must be on `load-path').
(require 'org-protocol-capture-html)

Shell script

The shell script is handy for piping any HTML (or plain-text) content to Org through the shell, but it’s not required.

To-Do

Handle long chunks of HTML

If you try to capture too long a chunk of HTML, it will fail with “argument list too long” errors from emacsclient. Working around this will require capturing via STDIN instead of arguments. Since org-protocol is based on using URLs, this will probably require using a shell script and a new Emacs function, and perhaps another MIME protocol-handler. Even then, it might still run into problems, because the data is passed to the shell script as an argument in the protocol-handler. Working around that would probably require a non-protocol-handler-based method using a browser extension to send the HTML directly via STDIN. Might be possible with Pentadactyl instead of making an entirely new browser extension. Also, maybe the Org-mode Capture Firefox extension could be extended (…) to do this.

org-protocol-capture-html's People

Contributors

alphapapa avatar

Watchers

 avatar

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.