I was just trying out clang-rs and the results I'm seeing from a trivial tokenization are pretty wonky. I'll share with you the simplest cases I've managed to pinpoint and hopefully you can shed some light on what's going on.
The first issue I was seeing was with partial token results. Note that the last expression in the C source is `f.a = 0;`, but the last entry in the annotated token list is just `f` with no children. The normal tokens look just fine and include identifiers for `f` and `a` in the assignment.
tu: TranslationUnit {
spelling: "test/simple.c",
}
diags: []
tokens: [
Token {
kind: Keyword,
spelling: "typedef",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 1,
offset: 0,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 8,
offset: 7,
},
},
},
Token {
kind: Keyword,
spelling: "struct",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 9,
offset: 8,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 15,
offset: 14,
},
},
},
Token {
kind: Punctuation,
spelling: "{",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 2,
column: 1,
offset: 15,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 2,
column: 2,
offset: 16,
},
},
},
Token {
kind: Keyword,
spelling: "char",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 3,
offset: 19,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 7,
offset: 23,
},
},
},
Token {
kind: Punctuation,
spelling: "*",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 8,
offset: 24,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 9,
offset: 25,
},
},
},
Token {
kind: Identifier,
spelling: "a",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 9,
offset: 25,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 10,
offset: 26,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 10,
offset: 26,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 11,
offset: 27,
},
},
},
Token {
kind: Punctuation,
spelling: "}",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 1,
offset: 28,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 2,
offset: 29,
},
},
},
Token {
kind: Identifier,
spelling: "foo",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 3,
offset: 30,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 6,
offset: 33,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 6,
offset: 33,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 7,
offset: 34,
},
},
},
Token {
kind: Keyword,
spelling: "int",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 1,
offset: 36,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 4,
offset: 39,
},
},
},
Token {
kind: Identifier,
spelling: "main",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 5,
offset: 40,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 9,
offset: 44,
},
},
},
Token {
kind: Punctuation,
spelling: "(",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 9,
offset: 44,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 10,
offset: 45,
},
},
},
Token {
kind: Punctuation,
spelling: ")",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 10,
offset: 45,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 11,
offset: 46,
},
},
},
Token {
kind: Punctuation,
spelling: "{",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 7,
column: 1,
offset: 47,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 7,
column: 2,
offset: 48,
},
},
},
Token {
kind: Identifier,
spelling: "foo",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 3,
offset: 51,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 6,
offset: 54,
},
},
},
Token {
kind: Identifier,
spelling: "f",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 7,
offset: 55,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 8,
offset: 56,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 8,
offset: 56,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 9,
offset: 57,
},
},
},
Token {
kind: Identifier,
spelling: "f",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 3,
offset: 60,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 4,
offset: 61,
},
},
},
Token {
kind: Punctuation,
spelling: ".",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 4,
offset: 61,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 5,
offset: 62,
},
},
},
Token {
kind: Identifier,
spelling: "a",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 5,
offset: 62,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 6,
offset: 63,
},
},
},
Token {
kind: Punctuation,
spelling: "=",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 7,
offset: 64,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 8,
offset: 65,
},
},
},
Token {
kind: Literal,
spelling: "0",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 9,
offset: 66,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 10,
offset: 67,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 10,
offset: 67,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 11,
offset: 68,
},
},
},
Token {
kind: Punctuation,
spelling: "}",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 10,
column: 1,
offset: 69,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 10,
column: 2,
offset: 70,
},
},
},
]
annotated: [
Some(
Entity {
kind: TypedefDecl,
display_name: Some(
"foo",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 3,
offset: 30,
},
),
},
),
Some(
Entity {
kind: TypedefDecl,
display_name: Some(
"foo",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 3,
offset: 30,
},
),
},
),
None,
None,
Some(
Entity {
kind: FieldDecl,
display_name: Some(
"a",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 9,
offset: 25,
},
),
},
),
None,
None,
None,
Some(
Entity {
kind: StructDecl,
display_name: None,
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 9,
offset: 8,
},
),
},
),
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
Some(
Entity {
kind: TypeRef,
display_name: Some(
"foo",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 3,
offset: 51,
},
),
},
),
None,
None,
Some(
Entity {
kind: CompoundStmt,
display_name: None,
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 7,
column: 1,
offset: 47,
},
),
},
),
Some(
Entity {
kind: DeclRefExpr,
display_name: Some(
"f",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 3,
offset: 60,
},
),
},
),
]
children: Map {
iter: Iter(
[
Some(
Entity {
kind: TypedefDecl,
display_name: Some(
"foo",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 3,
offset: 30,
},
),
},
),
Some(
Entity {
kind: TypedefDecl,
display_name: Some(
"foo",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 3,
offset: 30,
},
),
},
),
None,
None,
Some(
Entity {
kind: FieldDecl,
display_name: Some(
"a",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 9,
offset: 25,
},
),
},
),
None,
None,
None,
Some(
Entity {
kind: StructDecl,
display_name: None,
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 9,
offset: 8,
},
),
},
),
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
Some(
Entity {
kind: TypeRef,
display_name: Some(
"foo",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 3,
offset: 51,
},
),
},
),
None,
None,
Some(
Entity {
kind: CompoundStmt,
display_name: None,
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 7,
column: 1,
offset: 47,
},
),
},
),
Some(
Entity {
kind: DeclRefExpr,
display_name: Some(
"f",
),
location: Some(
SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 3,
offset: 60,
},
),
},
),
],
),
}
The second crazy issue showed up when I made a simple change to the C file: I made the `char *a;` member an integer instead. There are still no diagnostics, but now there are no annotated tokens at all (every entry is `None`). The normal tokens again look just fine and include identifiers for `f` and `a` in the assignment.
tu: TranslationUnit {
spelling: "test/simple.c",
}
diags: []
tokens: [
Token {
kind: Keyword,
spelling: "typedef",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 1,
offset: 0,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 8,
offset: 7,
},
},
},
Token {
kind: Keyword,
spelling: "struct",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 9,
offset: 8,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 1,
column: 15,
offset: 14,
},
},
},
Token {
kind: Punctuation,
spelling: "{",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 2,
column: 1,
offset: 15,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 2,
column: 2,
offset: 16,
},
},
},
Token {
kind: Keyword,
spelling: "int",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 3,
offset: 19,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 6,
offset: 22,
},
},
},
Token {
kind: Identifier,
spelling: "a",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 7,
offset: 23,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 8,
offset: 24,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 8,
offset: 24,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 3,
column: 9,
offset: 25,
},
},
},
Token {
kind: Punctuation,
spelling: "}",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 1,
offset: 26,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 2,
offset: 27,
},
},
},
Token {
kind: Identifier,
spelling: "foo",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 3,
offset: 28,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 6,
offset: 31,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 6,
offset: 31,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 4,
column: 7,
offset: 32,
},
},
},
Token {
kind: Keyword,
spelling: "int",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 1,
offset: 34,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 4,
offset: 37,
},
},
},
Token {
kind: Identifier,
spelling: "main",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 5,
offset: 38,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 9,
offset: 42,
},
},
},
Token {
kind: Punctuation,
spelling: "(",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 9,
offset: 42,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 10,
offset: 43,
},
},
},
Token {
kind: Punctuation,
spelling: ")",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 10,
offset: 43,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 6,
column: 11,
offset: 44,
},
},
},
Token {
kind: Punctuation,
spelling: "{",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 7,
column: 1,
offset: 45,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 7,
column: 2,
offset: 46,
},
},
},
Token {
kind: Identifier,
spelling: "foo",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 3,
offset: 49,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 6,
offset: 52,
},
},
},
Token {
kind: Identifier,
spelling: "f",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 7,
offset: 53,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 8,
offset: 54,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 8,
offset: 54,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 8,
column: 9,
offset: 55,
},
},
},
Token {
kind: Identifier,
spelling: "f",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 3,
offset: 58,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 4,
offset: 59,
},
},
},
Token {
kind: Punctuation,
spelling: ".",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 4,
offset: 59,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 5,
offset: 60,
},
},
},
Token {
kind: Identifier,
spelling: "a",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 5,
offset: 60,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 6,
offset: 61,
},
},
},
Token {
kind: Punctuation,
spelling: "=",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 7,
offset: 62,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 8,
offset: 63,
},
},
},
Token {
kind: Literal,
spelling: "0",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 9,
offset: 64,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 10,
offset: 65,
},
},
},
Token {
kind: Punctuation,
spelling: ";",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 10,
offset: 65,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 9,
column: 11,
offset: 66,
},
},
},
Token {
kind: Punctuation,
spelling: "}",
range: SourceRange {
start: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 10,
column: 1,
offset: 67,
},
end: SourceLocation {
file: Some(
File {
path: "test/simple.c",
},
),
line: 10,
column: 2,
offset: 68,
},
},
},
]
annotated: [
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
]
children: Map {
iter: Iter(
[
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
None,
],
),
}