From 8561e4e2903ae0b4aff1ed1dc9e9871c89df6b43 Mon Sep 17 00:00:00 2001
From: Tobias Burnus
Date: Fri, 7 Feb 2025 13:20:25 +0100
Subject: [PATCH] [GCN] Handle generic ISA names in libgomp's plugin-gcn.c

libgomp/ChangeLog:

	* plugin/plugin-gcn.c (ELFABIVERSION_AMDGPU_HSA_V6,
	EF_AMDGPU_GENERIC_VERSION_V, EF_AMDGPU_GENERIC_VERSION_OFFSET,
	GET_GENERIC_VERSION): New #define.
	(elf_gcn_isa_is_generic): New.
	(isa_matches_agent): Accept all generic code objects on the first go;
	extend the diagnostic and handle runtime-failed case.
	(create_and_finalize_hsa_program): Call it also after loading the code
	failed, pass the status.
---
 libgomp/plugin/plugin-gcn.c | 124 ++++++++++++++++++++++++++++--------
 1 file changed, 97 insertions(+), 27 deletions(-)

diff --git a/libgomp/plugin/plugin-gcn.c b/libgomp/plugin/plugin-gcn.c
index 8015a6f80f3..5c65778191a 100644
--- a/libgomp/plugin/plugin-gcn.c
+++ b/libgomp/plugin/plugin-gcn.c
@@ -66,6 +66,14 @@
 #define R_AMDGPU_RELATIVE64	13	/* B + A  */
 #endif
 
+#define ELFABIVERSION_AMDGPU_HSA_V6		4
+
+#define EF_AMDGPU_GENERIC_VERSION_V		0xff000000  /* Mask.  */
+#define EF_AMDGPU_GENERIC_VERSION_OFFSET	24
+
+#define GET_GENERIC_VERSION(VAR) (((VAR) & EF_AMDGPU_GENERIC_VERSION_V) \
+				  >> EF_AMDGPU_GENERIC_VERSION_OFFSET)
+
 /* GCN specific definitions for asynchronous queues.  */
 
 #define ASYNC_QUEUE_SIZE 64
@@ -242,7 +250,7 @@ struct kernel_dispatch
 };
 
 /* Structure of the kernargs segment, supporting console output.
- 
+
    This needs to match the definitions in Newlib, and the expectations
    in libgomp target code.  */
 
@@ -1668,6 +1676,17 @@ elf_gcn_isa_field (Elf64_Ehdr *image)
   return image->e_flags & EF_AMDGPU_MACH_MASK;
 }
 
+/* Return nonzero if IMAGE is a generic code object, i.e. if e_ident[8] is
+   ELFABIVERSION_AMDGPU_HSA_V6 and the generic-version bits of e_flags are
+   nonzero.  */
+
+static int
+elf_gcn_isa_is_generic (Elf64_Ehdr *image)
+{
+  return (image->e_ident[8] == ELFABIVERSION_AMDGPU_HSA_V6
+	  && GET_GENERIC_VERSION (image->e_flags));
+}
+
 /* Returns the name that the HSA runtime uses for the ISA or NULL if we do not
    support the ISA.  */
 
@@ -2399,38 +2418,88 @@ init_basic_kernel_info (struct kernel_info *kernel,
   return true;
 }
 
-/* Check that the GCN ISA of the given image matches the ISA of the agent. */
+/* Return true if STATUS is HSA_STATUS_SUCCESS and IMAGE is either generic
+   code or has the same ISA as AGENT.  Otherwise - on an ISA mismatch or if
+   loading already failed with STATUS - report a hopefully useful diagnostic
+   via hsa_error and return false.  */
 
 static bool
-isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image)
+isa_matches_agent (struct agent_info *agent, Elf64_Ehdr *image,
+		   hsa_status_t status)
 {
+  /* Generic image - assume that it works and only return to here
+     when it fails, i.e. status != HSA_STATUS_SUCCESS.  */
+  if (status == HSA_STATUS_SUCCESS && elf_gcn_isa_is_generic (image))
+    return true;
+
   int isa_field = elf_gcn_isa_field (image);
-  const char* isa_s = isa_name (isa_field);
-  if (!isa_s)
+  if (status == HSA_STATUS_SUCCESS && isa_field == agent->device_isa)
+    return true;
+
+  /* If we get here, either the binary is non-generic and has a mismatch of
+     the ISA - or is generic but not handled by the ROCm (e.g. because ROCm
+     is too old).  */
+
+  char msg[340];
+  char agent_isa_xs[8];
+  char device_isa_xs[8];
+  const char *agent_isa_s = isa_name (agent->device_isa);
+  const char *device_isa_s = isa_name (isa_field);
+  if (agent_isa_s == NULL)
     {
-      hsa_error ("Unsupported ISA in GCN code object.", HSA_STATUS_ERROR);
-      return false;
+      snprintf (agent_isa_xs, sizeof agent_isa_xs,
+		"0x%X", agent->device_isa);
+      agent_isa_s = agent_isa_xs;
+    }
+  if (device_isa_s == NULL)
+    {
+      snprintf (device_isa_xs, sizeof device_isa_xs, "0x%X", isa_field);
+      device_isa_s = device_isa_xs;
     }
 
-  if (isa_field != agent->device_isa)
-    {
-      char msg[204];
-      const char *agent_isa_s = isa_name (agent->device_isa);
-      assert (agent_isa_s);
+  /* Some error which should be unrelated to the ISA.  */
+  if (status != HSA_STATUS_SUCCESS
+      && status != HSA_STATUS_ERROR_INVALID_CODE_OBJECT
+      && status != HSA_STATUS_ERROR_INVALID_ISA_NAME
+      && status != HSA_STATUS_ERROR_INCOMPATIBLE_ARGUMENTS)
+    snprintf (msg, sizeof msg,
+	      "Could not load GCN code object with ISA %s on GPU with "
+	      "ISA %s (device %d).\n"
+	      "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices or run with LOADER_ENABLE_LOGGING=1 for more details.",
+	      device_isa_s, agent_isa_s, agent->device_id);
+  else if (status == HSA_STATUS_ERROR_INVALID_ISA_NAME
+	   && elf_gcn_isa_is_generic (image))
+    snprintf (msg, sizeof msg,
+	      "Unsupported generic ISA %s on GPU with ISA %s (device %d).\n"
+	      "%s%s%s"
+	      "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices, run with LOADER_ENABLE_LOGGING=1 for more details, "
+	      "or try updating to a ROCm that supports this generic ISA.",
+	      device_isa_s, agent_isa_s, agent->device_id,
+	      agent_isa_s[0] != '0'
+	      ? "Try to recompile with '-foffload-options=-march=" : "",
+	      agent_isa_s[0] != '0' ? agent_isa_s : "",
+	      agent_isa_s[0] != '0' ? "'.\n" : "");
+  else if (agent_isa_s[0] == '0')
+    snprintf (msg, sizeof msg,
+	      "GCN code object ISA '%s' is incompatible with GPU ISA '%s' "
+	      "(device %d).\n"
+	      "Consider using ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices or run with LOADER_ENABLE_LOGGING=1 for more details.",
+	      device_isa_s, agent_isa_s, agent->device_id);
+  else
+    snprintf (msg, sizeof msg,
+	      "GCN code object ISA '%s' is incompatible with GPU ISA '%s' "
+	      "(device %d).\n"
+	      "Try to recompile with '-foffload-options=-march=%s',\n"
+	      "or use ROCR_VISIBLE_DEVICES to disable incompatible "
+	      "devices.\n",
+	      device_isa_s, agent_isa_s, agent->device_id, agent_isa_s);
 
-      snprintf (msg, sizeof msg,
-		"GCN code object ISA '%s' does not match GPU ISA '%s' "
-		"(device %d).\n"
-		"Try to recompile with '-foffload-options=-march=%s',\n"
-		"or use ROCR_VISIBLE_DEVICES to disable incompatible "
-		"devices.\n",
-		isa_s, agent_isa_s, agent->device_id, agent_isa_s);
-
-      hsa_error (msg, HSA_STATUS_ERROR);
-      return false;
-    }
-
-  return true;
+  hsa_error (msg, status != HSA_STATUS_SUCCESS
+		  ? status : HSA_STATUS_ERROR_INVALID_CODE_OBJECT);
+  return false;
 }
 
 /* Create and finalize the program consisting of all loaded modules.  */
@@ -2464,7 +2533,8 @@ create_and_finalize_hsa_program (struct agent_info *agent)
     {
       Elf64_Ehdr *image = (Elf64_Ehdr *)module->image_desc->gcn_image->image;
 
-      if (!isa_matches_agent (agent, image))
+      /* Check the ISA early because older ROCm had unhelpful errors.  */
+      if (!isa_matches_agent (agent, image, HSA_STATUS_SUCCESS))
 	goto fail;
 
       hsa_code_object_t co = { 0 };
@@ -2482,7 +2552,7 @@ create_and_finalize_hsa_program (struct agent_info *agent)
 	(agent->executable, agent->id, co, "");
       if (status != HSA_STATUS_SUCCESS)
 	{
-	  hsa_error ("Could not load GCN code object", status);
+	  isa_matches_agent (agent, image, status);
 	  goto fail;
 	}
 