debugVarTool/parse_xml/Src/parse_xml.py
Razvalyaev e99de603e6 попытка сделать в parse_xml парсинг вложенных массивов [][]
определяет вложенные массивы но не определяет их размерности (нули)
2025-07-22 18:57:59 +03:00

380 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# pyinstaller --onefile --distpath ./parse_xml --workpath ./build --specpath ./build parse_xml/Src/parse_xml.py
# python -m nuitka --standalone --onefile --output-dir=./build parse_xml/Src/parse_xml.py
import xml.etree.ElementTree as ET
import xml.dom.minidom
import sys
import os
# Command-line handling: <input.xml> and <info.txt> are required, [output.xml] is optional.
if len(sys.argv) < 3:
    # NOTE(review): the usage text names simplify_dwarf.py while the build
    # commands at the top of this file reference parse_xml.py - confirm which
    # name is intended before changing the message.
    print("Usage: python simplify_dwarf.py <input.xml> <info.txt> [output.xml]")
    sys.exit(1)

input_path = sys.argv[1]
info_path = sys.argv[2]

# Name-keyed size table consulted by get_die_size() when a DIE has no
# DW_AT_byte_size attribute. Insertion order is kept as-is because the lookup
# iterates over this dict.
# NOTE(review): "char" and "int" both map to 2 - presumably target-specific; confirm.
base_type_sizes = {
    "char": 2,
    "short": 2,
    "int": 2,
    "long": 4,
    "long long": 8,
    "float": 4,
    "double": 8,
}

# Without an explicit third argument the result is written next to the input
# file as "simplified.xml".
if len(sys.argv) < 4:
    input_dir = os.path.dirname(os.path.abspath(input_path))
    output_path = os.path.join(input_dir, "simplified.xml")
else:
    output_path = sys.argv[3]

# Parse the input document once; every lookup below works on this tree.
tree = ET.parse(input_path)
root = tree.getroot()
def extract_timestamp(info_path):
    """Return the text following the "Time Stamp:" marker in the info file.

    The file is opened as UTF-8 and scanned line by line. For the first line
    that contains the marker, the text between the marker and the next
    occurrence of the marker (or the end of the line) is returned with
    surrounding whitespace stripped.

    Returns None when no line contains the marker.
    """
    marker = "Time Stamp:"
    with open(info_path, "r", encoding="utf-8") as info_file:
        for text_line in info_file:
            if marker not in text_line:
                continue
            pieces = text_line.split(marker)
            if len(pieces) > 1:
                return pieces[1].strip()
    return None
# Lookup table: value of the "id" attribute -> <die> element, built over every
# <die> in the parsed tree that carries an "id" attribute.
die_by_id = {
    element.attrib["id"]: element
    for element in root.iter("die")
    if "id" in element.attrib
}
def get_attr(die, attr_type):
    """Return the <value> child of the first <attribute> of ``die`` whose kind matches.

    An <attribute> child matches when the text of its <type> child equals
    ``attr_type``. The result is whatever ``find("value")`` yields for that
    attribute, so it is None if the matching attribute has no <value> child.

    Returns None when no attribute matches.
    """
    for attribute in die.findall("attribute"):
        kind = attribute.find("type")
        if kind is None or kind.text != attr_type:
            continue
        return attribute.find("value")
    return None
def get_die_size(die):
    """Return the size of a DIE, or None when it cannot be determined.

    The size comes from the DIE's DW_AT_byte_size attribute when one with a
    <const> value is present. Otherwise the DIE's DW_AT_name is matched
    against the keys of the module-level ``base_type_sizes`` table.

    The name is read from the same layout the rest of this file uses
    (<attribute><type>DW_AT_name</type><value><string>...): the lookup by an
    XML attribute ``name='DW_AT_name'`` is kept only as a last resort.
    Table keys are matched as whole words, longest key first, so that
    "long long" is not reported with the size of "long" and "int" does not
    match inside an unrelated identifier.

    Raises:
        ValueError, TypeError: if a DW_AT_byte_size <const> does not hold an
            integer literal (same as ``int(text, 0)``).
    """
    import re  # local import keeps this block independent of the file header

    type_name = None
    for attr in die.findall("attribute"):
        kind = attr.find("type")
        if kind is None:
            continue
        if kind.text == "DW_AT_byte_size":
            const_elem = attr.find("value/const")
            if const_elem is not None:
                return int(const_elem.text, 0)
        elif kind.text == "DW_AT_name" and type_name is None:
            # Remember the name, but keep scanning: an explicit byte size wins.
            type_name = attr.findtext("value/string") or attr.findtext("value/const")

    if not type_name:
        # Layout the previous implementation queried; harmless if absent.
        legacy = die.find("attribute[@name='DW_AT_name']/value/const")
        if legacy is not None:
            type_name = legacy.text
    if not type_name:
        return None

    # Collapse whitespace so "long  long" still matches the "long long" key.
    normalized = " ".join(type_name.lower().split())
    for key in sorted(base_type_sizes, key=len, reverse=True):
        if re.search(r"\b" + re.escape(key) + r"\b", normalized):
            return base_type_sizes[key]
    return None
def resolve_type_die(type_id):
    """Return the DIE a type id ultimately refers to, skipping wrapper types.

    Starting from ``type_id``, DIEs tagged as volatile, const, typedef or
    TI far type are followed through their DW_AT_type reference until a DIE
    with any other tag is found; that DIE is returned.

    Returns None when the id is empty, is not present in ``die_by_id``, a
    wrapper has no DW_AT_type reference, or the chain revisits an id.
    """
    wrapper_tags = (
        "DW_TAG_volatile_type",
        "DW_TAG_const_type",
        "DW_TAG_typedef",
        "DW_TAG_TI_far_type",
    )
    seen = set()
    current_id = type_id
    while current_id and current_id not in seen:
        seen.add(current_id)  # guards against reference cycles
        die = die_by_id.get(current_id)
        if die is None:
            return None
        if die.findtext("tag") not in wrapper_tags:
            return die
        value = get_attr(die, "DW_AT_type")
        ref_elem = None if value is None else value.find("ref")
        if ref_elem is None:
            return None
        current_id = ref_elem.attrib.get("idref")
    return None
def _base_type_label(die):
    """Return the text of the DIE's DW_AT_name <string>, or "unknown" if not found."""
    name_elem = die.find("attribute[@type='DW_AT_name']/value/string")
    if name_elem is None:
        return "unknown"
    return name_elem.text


# Dictionary for simple base types keyed by DWARF tag (example).
# Each value is a callable taking a DIE and returning a short label.
# Not referenced by the other code visible in this file.
base_types_map = {
    "DW_TAG_base_type": _base_type_label,
    "DW_TAG_structure_type": lambda die: "struct",
    "DW_TAG_union_type": lambda die: "union",
    "DW_TAG_pointer_type": lambda die: "pointer",
    "DW_TAG_array_type": lambda die: "array",
}
def get_type_name(type_id):
    """Build a printable type name for the type referenced by ``type_id``.

    The id is first resolved with resolve_type_die(), so wrapper types
    (typedef, const, volatile, ...) are skipped. The result depends on the
    tag of the resolved DIE:

    * pointer: pointee name followed by "*"; "void*" when the pointer has no
      DW_AT_type reference; "unknown" when the pointee is unknown
    * base type: its DW_AT_name string, or "base_type_unknown" without a name
    * struct / union / enum: "struct NAME", "union NAME", "enum NAME", with
      an "anonymous_..." placeholder when there is no DW_AT_name
    * array: element type name followed by "[]", or "array[]" when the
      array has no DW_AT_type reference
    * anything else, or an unresolvable id: "unknown"
    """
    die = resolve_type_die(type_id)
    if die is None:
        return "unknown"

    tag = die.findtext("tag")

    # Pointer and array names are both derived from the referenced type.
    if tag in ("DW_TAG_pointer_type", "DW_TAG_array_type"):
        value = get_attr(die, "DW_AT_type")
        target = None if value is None else value.find("ref")
        if tag == "DW_TAG_pointer_type":
            if target is None:
                return "void*"
            pointee = get_type_name(target.attrib.get("idref"))
            return pointee if pointee == "unknown" else pointee + "*"
        if target is None:
            return "array[]"
        element_name = get_type_name(target.attrib.get("idref"))
        return f"{element_name}[]"

    if tag == "DW_TAG_base_type":
        name_value = get_attr(die, "DW_AT_name")
        if name_value is None:
            return "base_type_unknown"
        return name_value.findtext("string")

    # tag -> (keyword used as prefix, placeholder for unnamed types)
    named_kinds = {
        "DW_TAG_structure_type": ("struct", "anonymous_struct"),
        "DW_TAG_union_type": ("union", "anonymous_union"),
        "DW_TAG_enumeration_type": ("enum", "anonymous_enum"),
    }
    if tag in named_kinds:
        keyword, placeholder = named_kinds[tag]
        name_value = get_attr(die, "DW_AT_name")
        label = placeholder if name_value is None else name_value.findtext("string")
        return f"{keyword} {label}"

    return "unknown"
def parse_offset(offset_text):
    """Return the integer operand of a "DW_OP_plus_uconst <n>" expression.

    The operand is the last whitespace-separated token and is parsed with
    base auto-detection (so "0x10" and "16" are both accepted).

    Returns 0 when ``offset_text`` is empty/None or does not start with the
    "DW_OP_plus_uconst " prefix.
    """
    prefix = "DW_OP_plus_uconst "
    if not offset_text or not offset_text.startswith(prefix):
        return 0
    operand = offset_text.split()[-1]
    return int(operand, 0)
def _const_as_int(value_elem):
    """Return the integer stored in the <const> child of a <value> element, or None."""
    if value_elem is None:
        return None
    # get_attr() already hands back the <value> element, so <const> is a
    # direct child. The nested "value/const" path is still tried afterwards
    # because that is what earlier code looked for.
    const_elem = value_elem.find("const")
    if const_elem is None:
        const_elem = value_elem.find("value/const")
    if const_elem is None:
        return None
    try:
        return int(const_elem.text, 0)
    except (TypeError, ValueError):
        # Empty or non-numeric text: treat the bound as unavailable.
        return None


def _subrange_length(subrange_die):
    """Return the element count described by one DW_TAG_subrange_type DIE (0 if unknown)."""
    upper_bound = _const_as_int(get_attr(subrange_die, "DW_AT_upper_bound"))
    if upper_bound is not None:
        # The DWARF upper bound is inclusive, so the length is upper_bound + 1.
        return upper_bound + 1
    count = _const_as_int(get_attr(subrange_die, "DW_AT_count"))
    return 0 if count is None else count


def _array_element_die(array_die):
    """Return the resolved DIE of the array's element type, or None."""
    value = get_attr(array_die, "DW_AT_type")
    ref_elem = None if value is None else value.find("ref")
    if ref_elem is None:
        return None
    return resolve_type_die(ref_elem.attrib.get("idref"))


def get_array_dimensions(array_die):
    """Collect the sizes of all dimensions of an array type.

    Returns a list [dim0, dim1, ...] ordered from the outermost to the
    innermost dimension. A dimension whose size cannot be determined is
    reported as 0.

    Sources, in order:
    1. every DW_TAG_subrange_type child of ``array_die`` contributes one
       dimension (DW_AT_upper_bound + 1, else DW_AT_count);
    2. if there are no subrange children, a single dimension is estimated
       as array size // element size using get_die_size();
    3. if the resolved element type is itself an array, its dimensions are
       appended recursively.

    The bound values are read from the <const> child of the element that
    get_attr() returns; looking one level deeper (as before) never matched
    and made every dimension come out as 0.
    """
    dims = [
        _subrange_length(child)
        for child in array_die.findall("die")
        if child.findtext("tag") == "DW_TAG_subrange_type"
    ]

    element_die = _array_element_die(array_die)

    if not dims:
        # No subrange information: fall back to total size / element size.
        arr_size = get_die_size(array_die)
        elem_size = get_die_size(element_die) if element_die is not None else None
        if arr_size is not None and elem_size:
            dims.append(arr_size // elem_size)
        else:
            dims.append(0)

    # Nested array types contribute their own dimensions after ours.
    if element_die is not None and element_die.findtext("tag") == "DW_TAG_array_type":
        dims.extend(get_array_dimensions(element_die))

    return dims
def get_base_type_die(array_die):
    """Follow DW_AT_type through nested array types to the first non-array DIE.

    Each referenced type is resolved with resolve_type_die(). As long as the
    resolved DIE is again an array type, the walk continues from it.

    Returns the first resolved DIE that is not an array type. If the chain
    cannot be followed further (no DW_AT_type reference, or the reference
    does not resolve), the last array DIE reached is returned instead.
    """
    die = array_die
    while True:
        value = get_attr(die, "DW_AT_type")
        ref_elem = None if value is None else value.find("ref")
        if ref_elem is None:
            return die
        candidate = resolve_type_die(ref_elem.attrib.get("idref"))
        if candidate is None:
            return die
        if candidate.findtext("tag") != "DW_TAG_array_type":
            return candidate
        die = candidate
def handle_array_type(member_elem, resolved_type, offset=0):
    """Annotate an output element that describes an array.

    Sets these attributes on ``member_elem``:

    * "type": base element type name followed by one "[]" per dimension
    * "size": product of all dimensions times the base element size; when
      that product is zero, the array DIE's own size is used if available
    * "size1", "size2", ...: the individual dimension sizes
    * "kind": "array"

    When the base element type is a structure, its members are added below
    ``member_elem`` via add_members_recursive(), using ``offset`` as the
    base offset.
    """
    dims = get_array_dimensions(resolved_type)

    # Determine the base (non-array) element type.
    base_die = get_base_type_die(resolved_type)
    base_name = "unknown"
    base_size = None
    if base_die is not None:
        base_id = base_die.attrib.get("id")
        base_name = get_type_name(base_id if base_id else "")
        if base_id:
            base_size = get_die_size(base_die)

    member_elem.set("type", base_name + "[]" * len(dims))

    # Total size = number of elements * size of one base element.
    # Any zero dimension makes the element count (and so the total) zero.
    element_count = 1
    for dim in dims:
        element_count *= dim
    total_size = element_count * (base_size or 0)

    if total_size:
        member_elem.set("size", str(total_size))
    else:
        # Fallback: use the size recorded on the array DIE itself, if any.
        arr_size = get_die_size(resolved_type)
        if arr_size:
            member_elem.set("size", str(arr_size))

    # Per-dimension sizes: size1, size2, ...
    for index, dim in enumerate(dims, start=1):
        member_elem.set(f"size{index}", str(dim))

    member_elem.set("kind", "array")

    # Expand the fields when the base element is a structure.
    # NOTE(review): add_members_recursive() also sets "size" on the element it
    # is given, so for arrays of structures the value written above may be
    # replaced - confirm this is intended.
    if base_die is not None and base_die.findtext("tag") == "DW_TAG_structure_type":
        add_members_recursive(member_elem, base_die, offset)
def add_members_recursive(parent_elem, struct_die, base_offset=0):
    """Append one <member> element per data member of a struct/union DIE.

    Args:
        parent_elem: output element that receives the <member> children; its
            "size" attribute is set (as hex) when the DIE size is known.
        struct_die: the DW_TAG_structure_type or DW_TAG_union_type DIE.
        base_offset: offset added to every member's own offset, so nested
            members carry offsets relative to the outermost variable.

    Each emitted <member> has "name", "offset" (hex) and "type" attributes.
    Members of a union additionally get kind="union". Array members are
    completed by handle_array_type(); struct/union members are expanded
    recursively.

    A member is skipped when it has no name, no resolvable type reference,
    an "unknown" type name, or (for structures) no
    DW_AT_data_member_location.
    """
    is_union = struct_die.findtext("tag") == "DW_TAG_union_type"

    size = get_die_size(struct_die)
    if size is not None:
        parent_elem.set("size", hex(size))

    for member in struct_die.findall("die"):
        if member.findtext("tag") != "DW_TAG_member":
            continue

        name_attr = get_attr(member, "DW_AT_name")
        offset_attr = get_attr(member, "DW_AT_data_member_location")
        type_attr = get_attr(member, "DW_AT_type")
        if name_attr is None or type_attr is None:
            continue
        # Every union member starts at the beginning of the union, and DWARF
        # permits omitting the location in that case; only structure members
        # are dropped for a missing location.
        if offset_attr is None and not is_union:
            continue

        name = name_attr.findtext("string")
        type_ref = type_attr.find("ref")
        if name is None or type_ref is None:
            # Without a name the element cannot be serialized, and without a
            # <ref> there is no type to look up.
            continue

        own_offset = 0 if offset_attr is None else parse_offset(offset_attr.findtext("block"))
        offset = own_offset + base_offset

        type_id = type_ref.attrib.get("idref")
        resolved_type = resolve_type_die(type_id)
        type_name = get_type_name(type_id)
        if type_name == "unknown":
            continue

        member_elem = ET.SubElement(
            parent_elem, "member", name=name, offset=hex(offset), type=type_name
        )
        if is_union:
            member_elem.set("kind", "union")

        if resolved_type is None:
            continue
        tag = resolved_type.findtext("tag")
        if tag == "DW_TAG_array_type":
            handle_array_type(member_elem, resolved_type, offset)
        elif tag in ("DW_TAG_structure_type", "DW_TAG_union_type"):
            add_members_recursive(member_elem, resolved_type, offset)
# Build the simplified document: one <variable> per suitable DW_TAG_variable DIE.
output_root = ET.Element("variables")
for die in root.iter("die"):
    if die.findtext("tag") != "DW_TAG_variable":
        continue

    name_attr = get_attr(die, "DW_AT_name")
    addr_attr = get_attr(die, "DW_AT_location")
    type_attr = get_attr(die, "DW_AT_type")
    if name_attr is None or addr_attr is None or type_attr is None:
        continue

    name = name_attr.findtext("string")
    # A missing name string cannot be serialized; names containing "$" are skipped.
    if name is None or "$" in name:
        continue

    # Only variables located by a plain "DW_OP_addr <address>" expression are kept.
    addr_text = addr_attr.findtext("block")
    if not addr_text or not addr_text.startswith("DW_OP_addr "):
        continue
    addr = int(addr_text.split()[-1], 0)

    # Skip the variable instead of aborting the whole run when its
    # DW_AT_type carries no <ref> element.
    type_ref = type_attr.find("ref")
    if type_ref is None:
        continue
    type_id = type_ref.attrib.get("idref")
    resolved_type = resolve_type_die(type_id)
    type_name = get_type_name(type_id)

    # Variables in the address range [0x800, 0x8000) and variables of
    # unknown type are left out.
    if 0x800 <= addr < 0x8000 or type_name == "unknown":
        continue

    var_elem = ET.SubElement(output_root, "variable", name=name, address=hex(addr), type=type_name)
    if resolved_type is not None:
        tag = resolved_type.findtext("tag")
        if tag == "DW_TAG_array_type":
            handle_array_type(var_elem, resolved_type)
        elif tag in ("DW_TAG_structure_type", "DW_TAG_union_type"):
            add_members_recursive(var_elem, resolved_type)

# The timestamp read from the info file becomes the first child of the root.
# extract_timestamp() may return None, which yields an empty <timestamp/>.
timestamp = extract_timestamp(info_path)
timestamp_elem = ET.Element("timestamp")
timestamp_elem.text = timestamp
output_root.insert(0, timestamp_elem)

# Serialize, re-parse with minidom for pretty printing, and write the result.
rough_string = ET.tostring(output_root, encoding="utf-8")
pretty_xml = xml.dom.minidom.parseString(rough_string).toprettyxml(indent=" ")
with open(output_path, "w", encoding="utf-8") as f:
    f.write(pretty_xml)

# The two input files are deleted once the output has been written.
os.remove(input_path)
os.remove(info_path)
print(f"Simplified and formatted XML saved to: {output_path}")