org-protocol is awesome, but browsers do a pretty poor job of turning a page’s HTML content into plain text. However, Pandoc supports converting from HTML to Org-mode, so we can use it to turn HTML into Org-mode content! It can even turn HTML tables into Org tables!
Here’s an example of what you get in Emacs from capturing this page:
- org-protocol: This is what connects org-mode to the “outside world” using a MIME protocol handler. The instructions on the org-protocol page are a bit out of date, so you might want to try these instructions instead.
- Pandoc: I’m currently using Pandoc from Ubuntu Trusty, at version 1.12.2.1, and it is able to convert from HTML to org.
This function gets the HTML from the browser’s selection. It’s from this answer on StackOverflow.
function () {
var html = "";
if (typeof content.document.getSelection != "undefined") {
var sel = content.document.getSelection();
if (sel.rangeCount) {
var container = document.createElement("div");
for (var i = 0, len = sel.rangeCount; i < len; ++i) {
container.appendChild(sel.getRangeAt(i).cloneContents());
}
html = container.innerHTML;
}
} else if (typeof document.selection != "undefined") {
if (document.selection.type == "Text") {
html = document.selection.createRange().htmlText;
}
}
var relToAbs = function (href) {
var a = content.document.createElement("a");
a.href = href;
var abs = a.protocol + "//" + a.host + a.pathname + a.search + a.hash;
a.remove();
return abs;
};
var elementTypes = [
['a', 'href'],
['img', 'src']
];
var div = content.document.createElement('div');
div.innerHTML = html;
elementTypes.map(function(elementType) {
var elements = div.getElementsByTagName(elementType[0]);
for (var i = 0; i < elements.length; i++) {
elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));
}
});
return div.innerHTML;
}
Here’s a one-line version of it, better for pasting into bookmarklets and such:
function () {var html = ""; if (typeof content.document.getSelection != "undefined") {var sel = content.document.getSelection(); if (sel.rangeCount) {var container = document.createElement("div"); for (var i = 0, len = sel.rangeCount; i < len; ++i) {container.appendChild(sel.getRangeAt(i).cloneContents());} html = container.innerHTML;}} else if (typeof document.selection != "undefined") {if (document.selection.type == "Text") {html = document.selection.createRange().htmlText;}} var relToAbs = function (href) {var a = content.document.createElement("a"); a.href = href; var abs = a.protocol + "//" + a.host + a.pathname + a.search + a.hash; a.remove(); return abs;}; var elementTypes = [['a', 'href'], ['img', 'src']]; var div = content.document.createElement('div'); div.innerHTML = html; elementTypes.map(function(elementType) {var elements = div.getElementsByTagName(elementType[0]); for (var i = 0; i < elements.length; i++) {elements[i].setAttribute(elementType[1], relToAbs(elements[i].getAttribute(elementType[1])));}}); return div.innerHTML;}
That function goes in the bookmarklet, and the org-protocol sub-protocol is changed to `capture-html`, resulting in this:
/* Sends the page URL, title and selected HTML (with <a href>/<img src> made absolute) to Emacs through the capture-html sub-protocol. An empty title falls back to a placeholder so the URL never contains an empty path segment. */ content.location.href = 'org-protocol://capture-html://w/' + encodeURIComponent(content.location.href) + '/' + encodeURIComponent(content.document.title || "[untitled page]") + '/' + encodeURIComponent(function () {var doc = content.document; var html = ""; if (typeof doc.getSelection !== "undefined") {var sel = doc.getSelection(); if (sel.rangeCount) {var container = doc.createElement("div"); for (var i = 0, len = sel.rangeCount; i < len; ++i) {container.appendChild(sel.getRangeAt(i).cloneContents());} html = container.innerHTML;}} else if (typeof doc.selection !== "undefined") {if (doc.selection.type === "Text") {html = doc.selection.createRange().htmlText;}} var relToAbs = function (href) {var a = doc.createElement("a"); a.href = href; return a.href;}; var elementTypes = [['a', 'href'], ['img', 'src']]; var div = doc.createElement('div'); div.innerHTML = html; elementTypes.forEach(function (elementType) {var elements = div.getElementsByTagName(elementType[0]); for (var j = 0; j < elements.length; j++) {if (!elements[j].hasAttribute(elementType[1])) {continue;} elements[j].setAttribute(elementType[1], relToAbs(elements[j].getAttribute(elementType[1])));}}); return div.innerHTML;}());
Note: I use the Pentadactyl extension, so I had to use `content.location.href` instead of `location.href`, `content.document` instead of `window`, and `content.document.getSelection()` instead of `window.getSelection()`. This might work in plain Firefox too, or you might need to adjust it.
Put `org-protocol-capture-html.el` in your `load-path` and add to your init file:
;; Load org-protocol-capture-html (its .el file must be on `load-path').
(require 'org-protocol-capture-html)
The shell script is handy for piping any HTML (or plain-text) content to Org through the shell, but it’s not required.
If you try to capture too long a chunk of HTML, it will fail with “argument list too long” errors from `emacsclient`. Working around this will require capturing via STDIN instead of arguments. Since org-protocol is based on using URLs, this will probably require using a shell script and a new Emacs function, and perhaps another MIME protocol-handler. Even then, it might still run into problems, because the data is passed to the shell script as an argument in the protocol-handler. Working around that would probably require a non-protocol-handler-based method using a browser extension to send the HTML directly via STDIN. This might be possible with Pentadactyl instead of making an entirely new browser extension. Also, maybe the Org-mode Capture Firefox extension could be extended (…) to do this.