You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
193 lines
6.6 KiB
193 lines
6.6 KiB
; Syntax: LLVM MC AMDGPU assembly (';' starts a comment).
        .text
        .amdgcn_target "amdgcn-amd-amdhsa--gfx900"

;-----------------------------------------------------------------------
; _Z20vector_square_kernelIfEvPT_PKS0_y
;   Itanium-demangled:
;   void vector_square_kernel<float>(float*, const float*, unsigned long long)
;
; What the code does:
;   i = s8 * (dword[dispatch_ptr + 0x4] & 0xffff) + v0      (32-bit add)
;   while (i < n) {                                          (unsigned 64-bit)
;       arg0[i] = arg1[i] * arg1[i];                         (32-bit float)
;       i += dword[dispatch_ptr + 0xc];
;   }
;
; Entry registers -- assumed from the SGPRs/VGPRs enabled in this file's
; .amdhsa_kernel descriptor and the AMDHSA ABI ordering (TODO confirm
; against the LLVM AMDGPU usage guide):
;   s[0:3]  private segment buffer (never read here)
;   s[4:5]  dispatch packet pointer
;   s[6:7]  kernarg segment pointer
;   s8      workgroup id X
;   v0      work-item id X
; Kernarg layout (per the .amdgpu_metadata in this file):
;   0x00 arg0 global pointer (stored to), 0x08 arg1 global pointer
;   (loaded from), 0x10 n (8-byte by-value).
; NOTE(review): dispatch packet offsets 0x4 and 0xc are presumably
;   workgroup_size_x (low 16 bits) and grid_size_x of the HSA AQL kernel
;   dispatch packet -- confirm against the HSA specification.
;
; Register roles inside the loop:
;   v[0:1]   64-bit element index i        v[2:3]   byte offset i*4
;   v[4:5]   load, then store address      v6       loaded / squared value
;   s[4:5]   stride in bytes               s[6:7]   mask of finished lanes
;   s[8:9]   arg0                          s[10:11] arg1
;   s[12:13] n                             s[14:15] stride in elements
; The exec mask saved in s[0:1] is never restored; s[0:1] is reused as a
; carry register inside the loop and the wave simply ends at s_endpgm.
;-----------------------------------------------------------------------
        .section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
        .protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
        .globl _Z20vector_square_kernelIfEvPT_PKS0_y
        .p2align 8
        .type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
        ; Compute the starting index and mask off lanes that are already >= n.
        s_load_dword s0, s[4:5], 0x4                  ; s0 = dword at dispatch_ptr + 0x4
        s_load_dwordx2 s[12:13], s[6:7], 0x10         ; s[12:13] = n (kernarg offset 0x10)
        v_mov_b32_e32 v1, 0                           ; high half of index i = 0
        s_waitcnt lgkmcnt(0)                          ; both scalar loads must have landed
        s_and_b32 s0, s0, 0xffff                      ; keep only the low 16 bits
        s_mul_i32 s8, s8, s0                          ; s8 = workgroup_id_x * (low 16 bits)
        v_add_u32_e32 v0, s8, v0                      ; i.lo = s8 + work-item id (no carry out)
        v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]        ; vcc = (n > i), unsigned 64-bit
        s_and_saveexec_b64 s[0:1], vcc                ; s[0:1] = old exec; exec &= vcc
        s_cbranch_execz .LBB0_3                       ; no lane has work: go to exit
; %bb.1:
        ; Loop set-up: load pointers and stride, form byte offset and byte stride.
        s_load_dword s14, s[4:5], 0xc                 ; s14 = dword at dispatch_ptr + 0xc (stride)
        s_load_dwordx4 s[8:11], s[6:7], 0x0           ; s[8:9] = arg0, s[10:11] = arg1
        s_mov_b32 s15, 0                              ; s[14:15] = stride zero-extended to 64 bits
        v_lshlrev_b64 v[2:3], 2, v[0:1]               ; v[2:3] = i * 4 (byte offset)
        s_mov_b64 s[6:7], 0                           ; finished-lane mask = 0 (kernarg ptr is dead)
        s_waitcnt lgkmcnt(0)                          ; need s14 and s[8:11] from here on
        s_lshl_b64 s[4:5], s[14:15], 2                ; s[4:5] = stride * 4 (dispatch ptr is dead)
.LBB0_2: ; =>This Inner Loop Header: Depth=1
        ; Invariant: every lane enabled in exec has i < n.
        v_mov_b32_e32 v5, s11                         ; v5 = hi(arg1)
        v_add_co_u32_e32 v4, vcc, s10, v2             ; v4 = lo(arg1 + offset), carry -> vcc
        v_addc_co_u32_e32 v5, vcc, v5, v3, vcc        ; v5 = hi(arg1 + offset)
        global_load_dword v6, v[4:5], off             ; v6 = 32-bit value at arg1 + offset
        ; While the load is in flight: form the store address, advance i and offset.
        v_mov_b32_e32 v5, s9                          ; v5 = hi(arg0)
        v_mov_b32_e32 v7, s15                         ; v7 = hi(stride) = 0
        v_add_co_u32_e32 v0, vcc, s14, v0             ; i.lo += stride, carry -> vcc
        v_mov_b32_e32 v8, s5                          ; v8 = hi(stride bytes)
        v_add_co_u32_e64 v4, s[0:1], s8, v2           ; v4 = lo(arg0 + offset), carry -> s[0:1]
        v_add_co_u32_e64 v2, s[2:3], s4, v2           ; offset.lo += stride bytes, carry -> s[2:3]
        v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1]  ; v5 = hi(arg0 + offset); v3 still the old hi
        v_addc_co_u32_e32 v1, vcc, v1, v7, vcc        ; i.hi += carry
        v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3]     ; offset.hi += hi(stride bytes) + carry
        v_cmp_le_u64_e32 vcc, s[12:13], v[0:1]        ; vcc = (n <= next i): lane is finished
        s_or_b64 s[6:7], vcc, s[6:7]                  ; accumulate finished lanes
        s_waitcnt vmcnt(0)                            ; v6 must be loaded before use
        v_mul_f32_e32 v6, v6, v6                      ; v6 = v6 * v6
        global_store_dword v[4:5], v6, off            ; store at arg0 + this iteration's offset
        s_andn2_b64 exec, exec, s[6:7]                ; disable finished lanes after the store
        s_cbranch_execnz .LBB0_2                      ; repeat while any lane is still active
.LBB0_3:
        s_endpgm
;-----------------------------------------------------------------------
; Kernel descriptor for _Z20vector_square_kernelIfEvPT_PKS0_y, placed in
; .rodata and aligned to 64 bytes (.p2align 6).
; It declares no group (LDS) or private (scratch) memory, a 24-byte
; kernarg segment, and which registers are preloaded at wave start.
;-----------------------------------------------------------------------
        .section .rodata,#alloc
        .p2align 6
        .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
                .amdhsa_group_segment_fixed_size 0
                .amdhsa_private_segment_fixed_size 0
                .amdhsa_kernarg_size 24
                ; User SGPRs enabled below: private segment buffer, dispatch
                ; pointer and kernarg segment pointer. Presumably 4 + 2 + 2 = 8
                ; SGPRs per the AMDHSA ABI -- TODO confirm.
                .amdhsa_user_sgpr_count 8
                .amdhsa_user_sgpr_private_segment_buffer 1
                .amdhsa_user_sgpr_dispatch_ptr 1
                .amdhsa_user_sgpr_queue_ptr 0
                .amdhsa_user_sgpr_kernarg_segment_ptr 1
                .amdhsa_user_sgpr_dispatch_id 0
                .amdhsa_user_sgpr_flat_scratch_init 0
                .amdhsa_user_sgpr_private_segment_size 0
                ; System registers: only workgroup id X is requested;
                ; work-item id setting 0 (presumably X only -- confirm).
                .amdhsa_system_sgpr_private_segment_wavefront_offset 0
                .amdhsa_system_sgpr_workgroup_id_x 1
                .amdhsa_system_sgpr_workgroup_id_y 0
                .amdhsa_system_sgpr_workgroup_id_z 0
                .amdhsa_system_sgpr_workgroup_info 0
                .amdhsa_system_vgpr_workitem_id 0
                ; The kernel body in this file uses v0..v8 and s0..s15.
                .amdhsa_next_free_vgpr 9
                .amdhsa_next_free_sgpr 16
                .amdhsa_reserve_flat_scratch 0
                .amdhsa_reserve_xnack_mask 1
                ; Floating-point mode. NOTE(review): round modes 0/0 and
                ; denorm modes 3/3 look consistent with "FloatMode: 240"
                ; (0xF0) in the kernel-info comments -- confirm bit layout.
                .amdhsa_float_round_mode_32 0
                .amdhsa_float_round_mode_16_64 0
                .amdhsa_float_denorm_mode_32 3
                .amdhsa_float_denorm_mode_16_64 3
                .amdhsa_dx10_clamp 1
                .amdhsa_ieee_mode 1
                .amdhsa_fp16_overflow 0
                ; All exception traps are disabled.
                .amdhsa_exception_fp_ieee_invalid_op 0
                .amdhsa_exception_fp_denorm_src 0
                .amdhsa_exception_fp_ieee_div_zero 0
                .amdhsa_exception_fp_ieee_overflow 0
                .amdhsa_exception_fp_ieee_underflow 0
                .amdhsa_exception_fp_ieee_inexact 0
                .amdhsa_exception_int_div_zero 0
        .end_amdhsa_kernel
; Switch back to the kernel's text section to mark its end and record the
; symbol size (.Lfunc_end0 minus the kernel entry label).
        .section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
        .size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
        ; -- End function
; The .AMDGPU.csdata section below contains only compiler-emitted
; statistics as comments; it holds no directives or data.
        .section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 2
; VGPRBlocks: 2
; NumSGPRsForWavesPerEU: 18
; NumVGPRsForWavesPerEU: 9
; Occupancy: 10
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; __HIP_Coordinates<__HIP_BlockIdx>::x (Itanium-demangled symbol name).
; Weak, protected, 1-byte zero-filled object in its own .rodata
; subsection. In this file it is referenced only by .addrsig_sym.
        .protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
        .type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
        .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
        .weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
        .zero 1
        .size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
; __HIP_Coordinates<__HIP_BlockDim>::x (Itanium-demangled symbol name).
; Weak, protected, 1-byte zero-filled object in its own .rodata
; subsection. In this file it is referenced only by .addrsig_sym.
        .protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
        .type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
        .section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
        .weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
        .zero 1
        .size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
; __HIP_Coordinates<__HIP_ThreadIdx>::x (Itanium-demangled symbol name).
; Weak, protected, 1-byte zero-filled object in its own .rodata
; subsection. In this file it is referenced only by .addrsig_sym.
        .protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
        .type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
        .section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
        .weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
        .zero 1
        .size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
; __HIP_Coordinates<__HIP_GridDim>::x (Itanium-demangled symbol name).
; Weak, protected, 1-byte zero-filled object in its own .rodata
; subsection. In this file it is referenced only by .addrsig_sym.
        .protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
        .type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
        .section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
        .weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
        .zero 1
        .size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
; Toolchain identification, the empty .note.GNU-stack marker section, and
; the address-significance table: the four __HIP_Coordinates<...>::x
; objects defined in this file are listed as address-significant.
        .ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
        .section ".note.GNU-stack"
        .addrsig
        .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
        .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
        .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
        .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
; Code-object metadata (YAML between the two directives) describing the
; kernel _Z20vector_square_kernelIfEvPT_PKS0_y: three kernel arguments
; (two 8-byte global buffers at offsets 0 and 8, one 8-byte by-value at
; offset 16), a 24-byte kernarg segment, register counts, wavefront size
; 64, and target amdgcn-amd-amdhsa--gfx900.
; YAML nesting is significant: ".args" is a list of maps under the single
; "amdhsa.kernels" entry. No comments are placed inside the YAML body,
; because that text is handed to the metadata parser rather than treated
; as assembly.
        .amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .address_space: global
        .offset: 0
        .size: 8
        .value_kind: global_buffer
      - .address_space: global
        .offset: 8
        .size: 8
        .value_kind: global_buffer
      - .offset: 16
        .size: 8
        .value_kind: by_value
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 24
    .language: OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 1024
    .name: _Z20vector_square_kernelIfEvPT_PKS0_y
    .private_segment_fixed_size: 0
    .sgpr_count: 18
    .sgpr_spill_count: 0
    .symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
    .vgpr_count: 9
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx900
amdhsa.version:
  - 1
  - 1
...

        .end_amdgpu_metadata