Comments (8)
I've never written a Python C extension but here is what an LLM came up with as a precursor:
#include <Python.h>
#include <capstone/capstone.h>
// A struct to hold the Capstone handle and the current instruction
typedef struct {
csh handle;
cs_insn *insn;
} DisassemblerObject;
// tp_dealloc slot of the Disassembler class (the C-level destructor, not
// __del__): releases the Capstone resources, then the object memory itself.
static void Disassembler_dealloc(DisassemblerObject* self)
{
    // insn is NULL when __init__ failed or was never run (e.g. a subclass
    // that does not call the base __init__). cs_free() walks the array it is
    // given to release per-instruction detail, so it must not receive NULL
    // together with a non-zero count.
    if (self->insn != NULL) {
        cs_free(self->insn, 1);
        self->insn = NULL;
    }

    // A zero handle means cs_open() never succeeded or the handle was
    // already closed; there is nothing to close in that case.
    if (self->handle != 0) {
        cs_close(&self->handle);
    }

    Py_TYPE(self)->tp_free((PyObject*)self);
}
// The __init__ method of the Disassembler class (tp_init slot).
//
// Opens an x86-64 Capstone handle and allocates the single instruction
// buffer used by later disassembly calls. Positional and keyword arguments
// are accepted and ignored.
//
// Returns 0 on success, or -1 with a Python exception set on failure.
static int Disassembler_init(DisassemblerObject* self, PyObject* args, PyObject* kwds)
{
    // The signature matches initproc exactly so that the slot can call it
    // without going through an incompatible function-pointer type.
    (void)args;
    (void)kwds;

    // __init__ may be invoked more than once on the same object; release
    // whatever a previous call acquired so it is not leaked.
    if (self->insn != NULL) {
        cs_free(self->insn, 1);
        self->insn = NULL;
    }
    if (self->handle != 0) {
        cs_close(&self->handle);
        self->handle = 0;
    }

    // Initialize the Capstone handle
    cs_err err = cs_open(CS_ARCH_X86, CS_MODE_64, &self->handle);
    if (err != CS_ERR_OK) {
        self->handle = 0;
        PyErr_Format(PyExc_RuntimeError, "cs_open failed: %s", cs_strerror(err));
        return -1;
    }

    // Allocate memory for the current instruction
    self->insn = cs_malloc(self->handle);
    if (self->insn == NULL) {
        cs_close(&self->handle);
        self->handle = 0;
        PyErr_NoMemory();
        return -1;
    }

    return 0;
}
// The disasm method of the Disassembler class.
//
// Python signature: disasm(code, address) -> (mnemonic, address)
//   code    -- bytes-like object holding the machine code
//   address -- integer address assigned to the first byte of code
//
// Decodes one instruction from the start of code with cs_disasm_iter and
// returns its mnemonic (str) and address (int).
//
// Raises RuntimeError if the object was never successfully initialized and
// ValueError if Capstone cannot decode an instruction from the input.
static PyObject* Disassembler_disasm(DisassemblerObject* self, PyObject* args)
{
    Py_buffer view;
    unsigned long long start = 0;  // "K" stores an unsigned long long

    // "y*" fills a Py_buffer, so the length arrives as a Py_ssize_t without
    // depending on PY_SSIZE_T_CLEAN (which "y#" requires).
    if (!PyArg_ParseTuple(args, "y*K", &view, &start)) {
        return NULL;
    }

    PyObject* result = NULL;

    if (self->insn == NULL) {
        PyErr_SetString(PyExc_RuntimeError, "Disassembler is not initialized");
    } else {
        // cs_disasm_iter advances these three in place, so work on copies.
        const uint8_t* code = (const uint8_t*)view.buf;
        size_t size = (size_t)view.len;
        uint64_t address = (uint64_t)start;

        if (cs_disasm_iter(self->handle, &code, &size, &address, self->insn)) {
            // Py_BuildValue copies the mnemonic, so the result does not
            // alias the reusable instruction buffer.
            result = Py_BuildValue("(s,K)", self->insn->mnemonic,
                                   (unsigned long long)self->insn->address);
        } else {
            PyErr_SetString(PyExc_ValueError,
                            "could not disassemble an instruction from the given bytes");
        }
    }

    // The buffer must be released on every path once parsing has succeeded.
    PyBuffer_Release(&view);
    return result;
}
// The methods of the Disassembler class
static PyMethodDef Disassembler_methods[] = {
{"disasm", (PyCFunction)Disassembler_disasm, METH_VARARGS, "Disassemble the next instruction"},
{NULL} // Sentinel
};
// Static type object describing capstone.Disassembler.
static PyTypeObject DisassemblerType = {
    PyVarObject_HEAD_INIT(NULL, 0)

    // Identity and documentation
    .tp_name = "capstone.Disassembler",
    .tp_doc = "Disassembler objects",

    // Fixed-size instances (no variable-length tail)
    .tp_basicsize = sizeof(DisassemblerObject),
    .tp_itemsize = 0,

    // Default behaviour, and the type may be subclassed
    .tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,

    // Object lifecycle: generic allocation, then init / dealloc below
    .tp_new = PyType_GenericNew,
    .tp_init = (initproc) Disassembler_init,
    .tp_dealloc = (destructor) Disassembler_dealloc,

    // Methods exposed to Python
    .tp_methods = Disassembler_methods,
};
// The module's initialization function
PyMODINIT_FUNC PyInit_capstone(void)
{
PyObject* m;
if (PyType_Ready(&DisassemblerType) < 0) {
return NULL;
}
m = PyModule_Create(&moduledef);
if (m == NULL) {
return NULL;
}
Py_INCREF(&DisassemblerType);
PyModule_AddObject(m, "Disassembler", (PyObject *)&DisassemblerType);
return m;
}
Looks sensible to me and does the whole 1-memory alloc thing.
from capstone.
I guess this package uses ctypes (which I had to learn) -- I tried it myself but got a SEGFAULT:
diff --git a/sqlelf/elf/instruction.py b/sqlelf/elf/instruction.py
index 68acf50..9a90706 100644
--- a/sqlelf/elf/instruction.py
+++ b/sqlelf/elf/instruction.py
@@ -8,6 +8,18 @@ import apsw.ext
import capstone
import lief
+import ctypes
+
+# Define the cs_disasm_iter function
+cs_disasm_iter = capstone._cs.cs_disasm_iter
+cs_disasm_iter.argtypes = [
+ ctypes.POINTER(ctypes.c_size_t),
+ ctypes.POINTER(ctypes.POINTER(ctypes.c_ubyte)),
+ ctypes.POINTER(ctypes.c_size_t),
+ ctypes.POINTER(ctypes.c_uint64),
+ ctypes.POINTER(capstone._cs_insn),
+]
+
def elf_instructions(binaries: list[lief.Binary]):
def generator() -> Iterator[dict[str, Any]]:
@@ -28,6 +40,23 @@ def elf_instructions(binaries: list[lief.Binary]):
# super important that these accessors are pulled out
# of the tight loop as they can be costly
section_name = section.name
+
+ # Allocate a cs_insn struct
+ code_array = (ctypes.c_ubyte * len(data))(*data)
+ code_array = ctypes.cast(code_array, ctypes.POINTER(ctypes.c_ubyte))
+ insn = capstone._cs_insn()
+ size = ctypes.c_size_t(len(data))
+ address = ctypes.c_uint64(section.virtual_address)
+ while cs_disasm_iter(
+ md.csh,
+ ctypes.byref(code_array),
+ ctypes.byref(size),
+ ctypes.byref(address),
+ ctypes.byref(insn),
+ ):
+ print("here")
+ print(insn)
+
for (address, size, mnemonic, op_str) in md.disasm_lite(
data, section.virtual_address
):
from capstone.
Here is a working patch I got -- surprisingly, it was slower than the lite version.
diff --git a/sqlelf/elf/instruction.py b/sqlelf/elf/instruction.py
index 68acf50..5325b27 100644
--- a/sqlelf/elf/instruction.py
+++ b/sqlelf/elf/instruction.py
@@ -8,6 +8,18 @@ import apsw.ext
import capstone
import lief
+import ctypes
+
+# Define the cs_disasm_iter function
+cs_disasm_iter = capstone._cs.cs_disasm_iter
+cs_disasm_iter.argtypes = [
+ ctypes.c_size_t,
+ ctypes.POINTER(ctypes.POINTER(ctypes.c_ubyte)),
+ ctypes.POINTER(ctypes.c_size_t),
+ ctypes.POINTER(ctypes.c_uint64),
+ ctypes.POINTER(capstone._cs_insn),
+]
+
def elf_instructions(binaries: list[lief.Binary]):
def generator() -> Iterator[dict[str, Any]]:
@@ -28,15 +40,26 @@ def elf_instructions(binaries: list[lief.Binary]):
# super important that these accessors are pulled out
# of the tight loop as they can be costly
section_name = section.name
- for (address, size, mnemonic, op_str) in md.disasm_lite(
- data, section.virtual_address
+
+ # Allocate a cs_insn struct
+ code_array = (ctypes.c_ubyte * len(data))()
+ code_array = ctypes.cast(code_array, ctypes.POINTER(ctypes.c_ubyte))
+ insn = capstone._cs_insn()
+ size = ctypes.c_size_t(len(data))
+ address = ctypes.c_uint64(section.virtual_address)
+ while cs_disasm_iter(
+ md.csh,
+ ctypes.byref(code_array),
+ ctypes.byref(size),
+ ctypes.byref(address),
+ ctypes.byref(insn),
):
yield {
"path": binary_name,
"section": section_name,
- "mnemonic": mnemonic,
+ "mnemonic": insn.mnemonic.decode("ascii"),
"address": address,
- "operands": op_str,
+ "operands": insn.op_str.decode("ascii"),
}
return generator
TY to @markrwilliams for helping me debug it.
from capstone.
Thanks for the patch! Would you mind opening a PR with it?
from capstone.
@Rot127 I can open a PR with this patch but I found the performance to be lacking.
It doesn't give the same performance characteristics as the "_lite" version -- I suspect there is a lot of wasted time crossing the FFI boundary?
Subjectively it was 50% slower
Do you want the patch regardless and maybe someone else can tinker as to why it's slower?
from capstone.
from capstone.
@fzakaria Thanks for the patch, we are looking forward to your PR.
from capstone.
PR is up :)
from capstone.
Related Issues (20)
- [Auto-Sync] Add option Differ to update the old file.
- cstool -d x64 "4c 85 7d 30" doesn't give the correct register reads after sync with LLVM 7.0.1
- 5.0.0post1 Mac universal wheel is invalid for AArch64 HOT 2
- arm64 unimplemented instructions HOT 1
- SPDX and reuse lint
- Python API - missing instuction.reg_read/write values HOT 4
- [Auto-Sync] Translate templates to functions and not macros
- arm64: missing register in regs_access() for ldr instruction HOT 2
- x86: vmovsd has incorrect access for operands HOT 2
- vs2022 msvc143 build error HOT 3
- How to determine whether the CS_GRP_JUMP is a conditional jump or unconditional jump?? HOT 1
- 5.0.1 has broken ARM operand information (in Python 2) HOT 10
- 5.0.1 has more broken 'NOREGNAME' syntax on ARM32. HOT 6
- 5.0.0 & 5.0.1 do not install in python 3 on macOS (cannot load dynamic library) HOT 3
- Incomplete disassembly for x86 ud0 and ud1 HOT 1
- Sifting instruction encodings on ARM64, many capstone unsupported encodings discovered HOT 4
- [Auto-Sync] Generate general instruction encoding format HOT 1
- Windows precompiled binaries
- Universal wheel for macOS contains only x86_64 build HOT 9
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from capstone.