#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <vector>
#include <string>
#define FLAG_POINTER 0x0001
#define FLAG_BITFIELD 0x0002
#define FLAG_UNNAMED 0x0004
#define FLAG_COMPOSITE 0x0008
#define FLAG_SIGNED 0x0010
#define FLAG_FLOAT 0x0020
#define FLAG_BIG_ENDIAN 0x0040
#define FLAG_UNION 0x0001
#define FLAG_UNUSABLE 0x4000
#define FLAG_UNRECOGNISED 0x8000
#define PADDING(offset, align) ((align) - ((offset) % (align))) % (align)
typedef unsigned char u8;
typedef unsigned long long u64;
struct Struct;
struct Field {
char *field_name;
char *type_name;
Struct *st;
int bit_offset;
int bit_size;
int default_bit_size;
int array_len;
int pointer_levels;
int flags;
};
struct Field_Vector {
Field *data = nullptr;
int pool_size = 8;
int n_fields = 0;
Field_Vector() {
data = new Field[pool_size];
}
~Field_Vector() {
delete[] data;
}
Field& back() {
return data[n_fields-1];
}
// increase pool size by a power of two each time a pool expansion is necessary
void expand() {
int old_size = pool_size;
pool_size *= 2;
Field *next = new Field[pool_size];
memcpy(next, data, old_size * sizeof(Field));
delete[] data;
data = next;
}
Field& add(Field&& f) {
if (n_fields >= pool_size)
expand();
data[n_fields++] = f;
return back();
}
Field& add(Field& f) {
if (n_fields >= pool_size)
expand();
data[n_fields++] = f;
return back();
}
void cancel_latest() {
if (n_fields <= 0)
return;
data[n_fields] = (struct Field){0};
n_fields--;
}
void zero_out() {
memset(data, 0, pool_size * sizeof(Field));
n_fields = 0;
}
};
struct Struct {
char *name;
int offset;
int total_size;
int longest_primitive;
int flags;
Field_Vector fields;
};
struct Primitive {
const char *name;
int bit_size;
int flags;
};
// A list of primitive types in the form of a vector. This lets us potentially add new primitives at run-time.
std::vector<Primitive> primitive_types = {
{"char", 8, FLAG_SIGNED},
{"int8_t", 8, FLAG_SIGNED},
{"uint8_t", 8},
{"short", 16, FLAG_SIGNED},
{"int16_t", 16, FLAG_SIGNED},
{"uint16_t", 16},
{"int", 32, FLAG_SIGNED},
{"int32_t", 32, FLAG_SIGNED},
{"uint32_t", 32},
{"long", 32, FLAG_SIGNED},
{"long long", 64, FLAG_SIGNED},
{"float", 32, FLAG_FLOAT},
{"double", 64, FLAG_FLOAT}
};
int pointer_size = 64; // bits
bool is_symbol(char c) {
return !(c >= '0' && c <= '9') &&
!(c >= 'A' && c <= 'Z') &&
!(c >= 'a' && c <= 'z') &&
c != '_';
}
char *add_token(char *tokens, char *word) {
int len = strlen(word);
if (len) {
strcpy(tokens, word);
tokens += len + 1;
}
return tokens;
}
void store_word(char **pool, char *word, char **ptr) {
if (word) {
int len = strlen(word);
if (len) {
strcpy(*pool, word);
*ptr = *pool;
*pool += len + 1;
}
}
}
void tokenize(char *tokens, char *text, int sz) {
char *out = tokens;
char *in = text;
char *word = text;
bool multi_comment = false;
bool line_comment = false;
bool was_symbol = true;
bool was_token_end = false;
char prev = 0;
for (int i = 0; i < sz; i++, in++) {
char c = *in;
if (multi_comment && prev == '*' && c == '/') {
multi_comment = false;
word = in+1;
prev = c;
continue;
}
if (line_comment && c == '\n') {
line_comment = false;
word = in;
}
if (c == '/' && i < sz-1) {
char next = in[1];
if (next == '*')
multi_comment = true;
if (next == '/')
line_comment = true;
}
if (multi_comment || line_comment) {
prev = c;
continue;
}
// We make an exception for the '<', '>' and ':' symbols from being split into their own tokens
// to allow for templated and/or namespaced types.
bool sandwichable = (c == '<' || c == '>' || c == ':');
if (c <= ' ' || c > '~') {
*in = 0;
was_token_end = true;
}
else if (
(is_symbol(c) && !sandwichable) ||
(was_symbol && sandwichable) ||
(c == ':' && i < sz-1 && i > 0 && (in[1] != ':' && in[-1] != ':'))
) {
*in = 0;
out = add_token(out, word);
*out++ = c;
if (c == '*') {
while (i < sz-1 && in[1] == '*') {
*out++ = '*';
in++;
i++;
}
}
word = in+1;
*out++ = 0;
was_symbol = true;
was_token_end = false;
}
else {
if (was_token_end) {
out = add_token(out, word);
word = in;
}
was_symbol = false;
was_token_end = false;
}
prev = c;
}
if (was_token_end)
out = add_token(out, word);
*out++ = 0;
}
// TODO: Expand upon this function to allow for constant variable lookup
u64 evaluate_number(char *token) {
return strtoull(token, nullptr, 0);
}
void embed_struct(Struct *master, Struct *embed) {
Field& group = master->fields.back();
if (group.flags & FLAG_POINTER) {
master->offset += PADDING(master->offset, pointer_size);
group.bit_offset = master->offset;
group.bit_size = pointer_size;
master->offset += group.bit_size;
return;
}
group.bit_offset = master->offset;
group.bit_size = embed->total_size;
int align = embed->longest_primitive;
master->offset += PADDING(master->offset, align);
for (int i = 0; i < embed->fields.n_fields; i++) {
Field *f = &master->fields.add(embed->fields.data[i]);
f->bit_offset += master->offset;
}
master->offset += embed->total_size;
}
// When either a struct/union is defined or referenced inside another struct, a declaration list may follow.
// This function finds each element in the declaration list and embeds it into the current struct.
void add_struct_instances(Struct *current_st, Struct *embed, char **tokens, char **name_head) {
Field *f = ¤t_st->fields.back();
char *t = *tokens;
bool first = true;
while (*t) {
if (!first && *t == ';')
break;
first = false;
if (is_symbol(*t) && *t != ';') {
if (*t == '*')
f->flags |= FLAG_POINTER;
t += strlen(t) + 1;
continue;
}
f->flags |= FLAG_COMPOSITE;
f->st = nullptr;
f->type_name = embed->name;
if (!f->type_name)
f->st = embed;
if (*t == ';') {
embed_struct(current_st, embed);
current_st->fields.add({0});
break;
}
store_word(name_head, t, &f->field_name);
embed_struct(current_st, embed);
f = ¤t_st->fields.add({0});
t += strlen(t) + 1;
}
*tokens = t;
}
Struct *lookup_struct(std::vector<Struct*>& structs, Struct *st, Field *f, char *str) {
if (!strcmp(str, "void") || !strcmp(str, "struct") || !strcmp(str, "union"))
return nullptr;
Struct *record = nullptr;
for (auto& c : structs) {
if (!c->name || strcmp(c->name, str))
continue;
record = c;
break;
}
if (!record) {
f->flags |= FLAG_UNRECOGNISED;
return nullptr;
}
// If the requested struct/union is the same as the current one, that would make it grow infinitely, so we do not allow that.
if (!strcmp(record->name, st->name)) {
f->flags |= FLAG_UNUSABLE;
st->flags |= FLAG_UNUSABLE;
}
if (record->flags & FLAG_UNUSABLE)
f->flags |= FLAG_UNUSABLE;
if (f->flags & FLAG_UNUSABLE)
return nullptr;
return record;
}
void parse_c_struct(std::vector<Struct*>& structs, char **tokens, char **name_buf, bool top_level = true) {
char *name_head = *name_buf;
std::string type;
char *t = *tokens;
char *prev = nullptr;
Struct *st;
if (!structs.size()) {
st = new Struct();
structs.push_back(st);
}
else
st = structs.back();
Field *f = nullptr;
int field_flags = 0;
bool was_symbol = false;
bool was_array_open = false;
bool was_zero_width = false;
bool named_field = false;
char *held_type = nullptr;
// The longest primitive in a struct/union is used to determine its byte alignment.
// The longest field in a union is the size of the union.
st->longest_primitive = 8;
int longest_field = 8;
while (*t) {
char *next = t + strlen(t) + 1;
bool type_over = is_symbol(*next) && *next != '*';
if (!f) {
f = &st->fields.add({0});
f->flags = field_flags;
named_field = false;
}
// Inline struct/union declaration
if (*t == '{' && type.size() > 0) {
char *name = named_field ? prev : nullptr;
if (!top_level) {
bool is_union = !strcmp(type.c_str(), "union");
Struct *new_st = new Struct();
new_st->flags |= is_union ? FLAG_UNION : 0;
store_word(&name_head, name, &new_st->name);
structs.push_back(new_st);
t += strlen(t) + 1;
parse_c_struct(structs, &t, &name_head, false);
add_struct_instances(st, new_st, &t, &name_head);
next = t + strlen(t) + 1;
f = &st->fields.back();
}
else {
store_word(&name_head, name, &st->name);
top_level = false;
}
held_type = nullptr;
was_symbol = true;
type = "";
}
else if (is_symbol(*t) && *t != '*') {
// Bitfield
if (!strcmp(t, ":")) { // the bitfield symbol ':' must have a length of 1
int n = (int)evaluate_number(next);
if (n >= 0) {
if (n == 0) {
f->type_name = f->field_name = nullptr;
was_zero_width = true;
}
f->bit_size = n;
f->flags |= FLAG_BITFIELD;
if (!named_field || n == 0)
f->flags |= FLAG_UNNAMED;
}
}
// Set field name and type name
if (!was_symbol) {
if (!f->type_name)
store_word(&name_head, (char*)type.c_str(), &f->type_name);
if (!f->field_name && (f->flags & FLAG_UNNAMED) == 0)
store_word(&name_head, prev, &f->field_name);
}
// Calculate array length. Each successive array dimension is simply multiplied with the current array length.
if (*t == '[') {
int n = (int)evaluate_number(next);
if (n > 0) {
if (f->array_len > 0)
f->array_len *= n;
else
f->array_len = n;
}
}
// End of the current field
if (*t == ';' || *t == ',') {
if (f->flags & FLAG_POINTER) {
f->bit_size = pointer_size;
f->default_bit_size = pointer_size;
}
// Determine padding before the current field starts
int align = f->default_bit_size;
if (align > 0 && (was_zero_width || (f->flags & FLAG_BITFIELD) == 0)) {
if ((st->flags & FLAG_UNION) == 0)
st->offset += PADDING(st->offset, align);
if (align > st->longest_primitive)
st->longest_primitive = align;
was_zero_width = false;
}
if (f->array_len > 0)
f->bit_size *= f->array_len;
if (f->bit_size > longest_field)
longest_field = f->bit_size;
// A handy definition of the C union
f->bit_offset = st->offset;
if ((st->flags & FLAG_UNION) == 0)
st->offset += f->bit_size;
// Most zero-width fields are later overwritten.
// By setting the current field to NULL, we know that a new field must be created later.
if (!was_zero_width && (f->type_name || f->field_name)) {
field_flags = *t == ',' ? f->flags : 0;
held_type = f->type_name;
f = nullptr;
}
}
if (*t == ';') {
held_type = nullptr;
type = "";
}
was_symbol = true;
}
else {
if (*t == '*') {
f->flags |= FLAG_POINTER;
f->pointer_levels = strlen(t);
}
else if (held_type) {
named_field = true;
}
else {
// Determine if this field has a name (unnamed fields can be legal, eg. zero-width bitfields)
if (type_over) {
named_field = true;
for (auto& p : primitive_types) {
if (!strcmp(t, p.name)) {
named_field = false;
break;
}
}
if (named_field)
named_field = strcmp(t, "struct") && strcmp(t, "union");
}
// Ignore "const" when determining the field type
if ((!type_over && strcmp(t, "const")) || (type_over && !named_field)) {
if (type.size() > 0)
type += ' ';
type += t;
}
}
was_symbol = false;
}
if (held_type)
type = held_type;
// Determine the type of the current field.
// This code may run multiple times per field,
// as there some primitive types that use more than one keyword (eg. "long long")
if (f && type_over && type.size() > 0 && (f->flags & FLAG_COMPOSITE) == 0) {
const char *str = type.c_str();
bool found_primitive = false;
for (auto& p : primitive_types) {
if (!strcmp(str, p.name)) {
f->flags |= p.flags;
f->default_bit_size = p.bit_size;
if ((f->flags & FLAG_BITFIELD) == 0 || f->bit_size > f->default_bit_size)
f->bit_size = f->default_bit_size;
found_primitive = true;
break;
}
}
// Check if the type refers to an existing struct/union
if (!found_primitive) {
size_t last_space = type.find_last_of(' ');
std::string type_name = last_space == std::string::npos ? type : type.substr(last_space + 1);
// If so, then add its following declaration list to this struct
Struct *record = lookup_struct(structs, st, f, (char*)type_name.c_str());
if (record) {
add_struct_instances(st, record, &t, &name_head);
next = t + strlen(t) + 1;
f = &st->fields.back();
held_type = nullptr;
was_symbol = true;
type = "";
}
}
}
prev = t;
t = next;
if (*prev == '}')
break;
}
// Pad the struct to the next appropriate boundary
if (st->flags & FLAG_UNION)
st->total_size = longest_field + PADDING(longest_field, st->longest_primitive);
else
st->total_size = st->offset + PADDING(st->offset, st->longest_primitive);
// Cancel any remaining half-processed field
if (f)
st->fields.cancel_latest();
*name_buf = name_head;
*tokens = t;
}
int main(int argc, char **argv) {
if (argc < 2) {
printf("Invalid arguments\n"
"Usage: %s <text file with struct>\n", argv[0]);
return 1;
}
FILE *f = fopen(argv[1], "rb");
if (!f) {
printf("Could not open %s\n", argv[1]);
return 2;
}
fseek(f, 0, SEEK_END);
int sz = ftell(f);
rewind(f);
if (!sz)
return 3;
char *buf = (char*)malloc(sz + 1);
fread(buf, 1, sz, f);
buf[sz] = 0;
fclose(f);
char *tokens = (char*)malloc(sz * 2);
tokenize(tokens, buf, sz);
free(buf);
std::vector<Struct*> structs;
char *name_buf = (char*)malloc(sz * 2);
char *name_buf_alias = name_buf;
char *tokens_alias = tokens;
parse_c_struct(structs, &tokens_alias, &name_buf_alias);
Struct *st = structs[0];
printf("%s:\n", st->name);
char array_buf[32];
for (int i = 0; i < st->fields.n_fields; i++) {
Field& f = st->fields.data[i];
const char *is_ptr = (f.flags & FLAG_POINTER) ? "pointer " : nullptr;
char *is_array = nullptr;
if (f.array_len > 0) {
snprintf(array_buf, 32, "array[%d] ", f.array_len);
is_array = array_buf;
}
printf("%04d : %04d | %4x | %s : %s", f.bit_offset, f.bit_size, f.flags, f.field_name, f.type_name);
if (is_ptr || is_array) {
for (int j = 0; j < f.pointer_levels; j++)
putchar('*');
printf(" | %s%s\n", is_ptr ? is_ptr : "", is_array ? is_array : "");
}
else
putchar('\n');
}
printf("\nTotal size: %d (%d bits)\n", st->total_size / 8, st->total_size);
free(tokens);
free(name_buf);
return 0;
}