You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
195 lines
6.7 KiB
195 lines
6.7 KiB
;-----------------------------------------------------------------------
; Kernel: _Z20vector_square_kernelIfEvPT_PKS0_y
;         Itanium demangling:
;           void vector_square_kernel<float>(float*, float const*,
;                                            unsigned long long)
; Target: amdgcn-amd-amdhsa--gfx1030, wave32
; Syntax: LLVM MC AMDGPU assembly; ';' starts a comment.
;
; Effect per active lane, as written below (arg0/arg1/arg2 are the
; kernarg values at byte offsets 0x0, 0x8 and 0x10):
;     i = low32(s8 * (dword[s[4:5] + 0x4] & 0xffff) + v0)   ; zero-extended
;     if (i < arg2)
;         do { arg0[i] = arg1[i] * arg1[i];
;              i += dword[s[4:5] + 0xc]; } while (i < arg2);
;
; Entry-state assumptions (not visible in the instructions themselves;
; derived from the user/system SGPRs enabled in this kernel's descriptor
; -- confirm against the AMDHSA ABI):
;     s[4:5] = dispatch packet pointer    s[6:7] = kernarg segment pointer
;     s8     = workgroup id X             v0     = work-item id X
; NOTE(review): under that assumption dispatch+0x4 [15:0] is presumably
; workgroup_size_x and dispatch+0xc is presumably grid_size_x -- confirm
; against the HSA kernel dispatch packet layout.
;
; Register roles once setup is complete:
;     s[2:3]    element count (kernarg +0x10)
;     s[4:5]    store base address (kernarg +0x0)
;     s[6:7]    load base address  (kernarg +0x8)
;     s8        index stride per iteration, in elements
;     s9        0 at loop entry, then accumulated mask of finished lanes
;     s[10:11]  stride per iteration in bytes (s[8:9] << 2)
;     v[0:1]    64-bit element index
;     v[2:3]    64-bit byte offset (index * 4)
;     v[4:5]    address scratch            v6  loaded / squared value
;     s0, vcc_lo  scratch (saved exec copy, carries, compare result)
;-----------------------------------------------------------------------
        .text
        .amdgcn_target "amdgcn-amd-amdhsa--gfx1030"
        .section        .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
        .protected      _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
        .globl          _Z20vector_square_kernelIfEvPT_PKS0_y
        .p2align        8
        .type           _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y:  ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0:
        ; Fetch the per-workgroup multiplier and the element count.
        s_load_dword            s0, s[4:5], 0x4         ; s0 = dword at s[4:5] + 0x4
        s_load_dwordx2          s[2:3], s[6:7], 0x10    ; s[2:3] = 8-byte kernarg at +0x10
        s_waitcnt               lgkmcnt(0)              ; both scalar loads must have landed
        s_and_b32               s0, s0, 0xffff          ; keep only the low 16 bits

        ; v[0:1] = s8 * s0 + v[0:1]. Only the low dword is kept: v1 is
        ; overwritten with 0 on the next line, zero-extending the index.
        v_mad_u64_u32           v[0:1], null, s8, s0, v[0:1]
        v_mov_b32_e32           v1, 0

        ; Guard: keep only lanes whose index is below the element count.
        ; The exec copy in s0 is not read again (s0 is reused as a carry
        ; register inside the loop).
        s_mov_b32               s0, exec_lo
        v_cmpx_gt_u64_e64       s[2:3], v[0:1]          ; exec = lanes with s[2:3] > v[0:1]
        s_cbranch_execz         .LBB0_3                 ; no lane has work -> finish
; %bb.1:
        ; Loop setup. s[4:7] is overwritten with the two pointer arguments,
        ; so the stride must be loaded through s[4:5] first.
        s_load_dword            s8, s[4:5], 0xc         ; s8 = dword at s[4:5] + 0xc (stride)
        s_load_dwordx4          s[4:7], s[6:7], 0x0     ; s[4:5] = kernarg +0x0, s[6:7] = kernarg +0x8
        v_lshlrev_b64           v[2:3], 2, v[0:1]       ; byte offset = index * 4
        s_mov_b32               s9, 0                   ; high half of stride; also the initial "done" mask
        s_waitcnt               lgkmcnt(0)
        s_lshl_b64              s[10:11], s[8:9], 2     ; stride in bytes
        .p2align        6
.LBB0_2:                                ; =>This Inner Loop Header: Depth=1
        ; Load address = s[6:7] + byte offset.
        v_add_co_u32            v4, vcc_lo, s6, v2
        v_add_co_ci_u32_e32     v5, vcc_lo, s7, v3, vcc_lo
        ; Advance the 64-bit index by the stride while the load is issued.
        v_add_co_u32            v0, vcc_lo, v0, s8
        v_add_co_ci_u32_e32     v1, vcc_lo, 0, v1, vcc_lo
        global_load_dword       v6, v[4:5], off
        ; Store address = s[4:5] + byte offset (offset not yet advanced).
        v_add_co_u32            v4, vcc_lo, s4, v2
        v_add_co_ci_u32_e32     v5, vcc_lo, s5, v3, vcc_lo
        ; vcc_lo = lanes whose *next* index is past the end (count <= index).
        v_cmp_le_u64_e32        vcc_lo, s[2:3], v[0:1]
        ; Advance the byte offset; carry goes through s0 so vcc_lo survives.
        v_add_co_u32            v2, s0, v2, s10
        v_add_co_ci_u32_e64     v3, s0, s11, v3, s0
        s_or_b32                s9, vcc_lo, s9          ; remember every lane that has finished
        s_waitcnt               vmcnt(0)                ; loaded value must be available
        v_mul_f32_e32           v6, v6, v6              ; square
        global_store_dword      v[4:5], v6, off
        s_andn2_b32             exec_lo, exec_lo, s9    ; retire finished lanes after their store
        s_cbranch_execnz        .LBB0_2                 ; repeat while any lane is active
.LBB0_3:
        s_endpgm
;-----------------------------------------------------------------------
; Kernel descriptor for _Z20vector_square_kernelIfEvPT_PKS0_y, placed in
; .rodata with 64-byte alignment (.p2align 6).
;
; Values that can be cross-checked against the rest of this file:
;   kernarg_size 24       matches .kernarg_segment_size: 24 in the metadata
;                         (three 8-byte arguments at offsets 0, 8, 16)
;   wavefront_size32 1    matches .wavefront_size: 32 in the metadata
;   next_free_vgpr 7      the kernel body uses v0..v6
;   next_free_sgpr 12     the kernel body uses s0..s11
;   group/private size 0  matches the metadata's *_segment_fixed_size: 0
;
; NOTE(review): user_sgpr_count 8 is presumably 4 (private segment buffer)
; + 2 (dispatch ptr) + 2 (kernarg segment ptr) -- confirm against the
; AMDHSA ABI. The kernel body depends on this layout for s[4:5]/s[6:7]/s8.
;-----------------------------------------------------------------------
        .section        .rodata,#alloc
        .p2align        6
        .amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
                ; Memory segments
                .amdhsa_group_segment_fixed_size 0
                .amdhsa_private_segment_fixed_size 0
                .amdhsa_kernarg_size 24
                ; User SGPRs preloaded before the first instruction
                .amdhsa_user_sgpr_count 8
                .amdhsa_user_sgpr_private_segment_buffer 1
                .amdhsa_user_sgpr_dispatch_ptr 1
                .amdhsa_user_sgpr_queue_ptr 0
                .amdhsa_user_sgpr_kernarg_segment_ptr 1
                .amdhsa_user_sgpr_dispatch_id 0
                .amdhsa_user_sgpr_flat_scratch_init 0
                .amdhsa_user_sgpr_private_segment_size 0
                .amdhsa_wavefront_size32 1
                ; System SGPRs / VGPRs: only workgroup id X is requested
                .amdhsa_system_sgpr_private_segment_wavefront_offset 0
                .amdhsa_system_sgpr_workgroup_id_x 1
                .amdhsa_system_sgpr_workgroup_id_y 0
                .amdhsa_system_sgpr_workgroup_id_z 0
                .amdhsa_system_sgpr_workgroup_info 0
                .amdhsa_system_vgpr_workitem_id 0
                ; Register budget
                .amdhsa_next_free_vgpr 7
                .amdhsa_next_free_sgpr 12
                .amdhsa_reserve_flat_scratch 0
                ; Floating-point mode
                .amdhsa_float_round_mode_32 0
                .amdhsa_float_round_mode_16_64 0
                .amdhsa_float_denorm_mode_32 3
                .amdhsa_float_denorm_mode_16_64 3
                .amdhsa_dx10_clamp 1
                .amdhsa_ieee_mode 1
                .amdhsa_fp16_overflow 0
                ; Execution mode
                .amdhsa_workgroup_processor_mode 1
                .amdhsa_memory_ordered 1
                .amdhsa_forward_progress 0
                .amdhsa_shared_vgpr_count 0
                ; Exceptions: all disabled
                .amdhsa_exception_fp_ieee_invalid_op 0
                .amdhsa_exception_fp_denorm_src 0
                .amdhsa_exception_fp_ieee_div_zero 0
                .amdhsa_exception_fp_ieee_overflow 0
                .amdhsa_exception_fp_ieee_underflow 0
                .amdhsa_exception_fp_ieee_inexact 0
                .amdhsa_exception_int_div_zero 0
        .end_amdhsa_kernel
;-----------------------------------------------------------------------
; Re-enter the kernel's text section to mark the end of its code and
; record the symbol size (.Lfunc_end0 minus the kernel's entry label).
; Everything after the .AMDGPU.csdata directive is compiler-emitted
; statistics in comment form; it has no effect on the assembled output.
;-----------------------------------------------------------------------
        .section        .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
        .size   _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
                                        ; -- End function
        .section        .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 212
; NumSgprs: 14
; NumVgprs: 7
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 1
; VGPRBlocks: 0
; NumSGPRsForWavesPerEU: 14
; NumVGPRsForWavesPerEU: 7
; Occupancy: 16
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
;-----------------------------------------------------------------------
; Padding at the end of .text: align to 64 bytes using the 32-bit fill
; word 3214868480 (0xBF9F0000), then emit 48 more copies of that word
; (48 * 4 = 192 bytes).
; NOTE(review): 0xBF9F0000 is presumably the gfx10 s_code_end encoding,
; emitted so instruction prefetch past the last kernel reads defined
; bytes -- confirm against the ISA / LLVM AMDGPU documentation.
;-----------------------------------------------------------------------
        .text
        .p2alignl       6, 3214868480
        .fill           48, 4, 3214868480
;-----------------------------------------------------------------------
; __HIP_Coordinates<__HIP_BlockIdx>::x (demangled name)
; Weak, protected-visibility data object: one zero byte in its own
; .rodata subsection. Nothing in the kernel code of this file reads it.
;-----------------------------------------------------------------------
        .protected      _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
        .type           _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
        .section        .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
        .weak           _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
        .zero   1
        .size   _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1

;-----------------------------------------------------------------------
; __HIP_Coordinates<__HIP_BlockDim>::x (demangled name)
; Weak, protected-visibility data object: one zero byte in its own
; .rodata subsection. Nothing in the kernel code of this file reads it.
;-----------------------------------------------------------------------
        .protected      _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE ; @_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
        .type           _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
        .section        .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
        .weak           _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
        .zero   1
        .size   _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1

;-----------------------------------------------------------------------
; __HIP_Coordinates<__HIP_ThreadIdx>::x (demangled name)
; Weak, protected-visibility data object: one zero byte in its own
; .rodata subsection. Nothing in the kernel code of this file reads it.
;-----------------------------------------------------------------------
        .protected      _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE ; @_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
        .type           _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
        .section        .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
        .weak           _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
        .zero   1
        .size   _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1

;-----------------------------------------------------------------------
; __HIP_Coordinates<__HIP_GridDim>::x (demangled name)
; Weak, protected-visibility data object: one zero byte in its own
; .rodata subsection. Nothing in the kernel code of this file reads it.
;-----------------------------------------------------------------------
        .protected      _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE ; @_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
        .type           _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
        .section        .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
        .weak           _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
        .zero   1
        .size   _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1

;-----------------------------------------------------------------------
; Producer identification, the .note.GNU-stack marker section, and the
; address-significance table listing the four __HIP_Coordinates<...>::x
; objects defined earlier in this file.
;-----------------------------------------------------------------------
        .ident  "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
        .section        ".note.GNU-stack"
        .addrsig
        .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
        .addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
        .addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
        .addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
;-----------------------------------------------------------------------
; Code-object metadata (a YAML document between the two directives).
; It describes one kernel, _Z20vector_square_kernelIfEvPT_PKS0_y, with
; three 8-byte arguments: two global_buffer pointers at offsets 0 and 8
; and one by_value argument at offset 16 (24 bytes of kernarg in total).
; YAML nesting is significant: .args and .language_version are lists
; nested inside the single amdhsa.kernels entry.
; No ';' comments are placed inside the YAML body on purpose.
;-----------------------------------------------------------------------
        .amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .address_space:  global
        .offset:         0
        .size:           8
        .value_kind:     global_buffer
      - .address_space:  global
        .offset:         8
        .size:           8
        .value_kind:     global_buffer
      - .offset:         16
        .size:           8
        .value_kind:     by_value
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 24
    .language:       OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 1024
    .name:           _Z20vector_square_kernelIfEvPT_PKS0_y
    .private_segment_fixed_size: 0
    .sgpr_count:     14
    .sgpr_spill_count: 0
    .symbol:         _Z20vector_square_kernelIfEvPT_PKS0_y.kd
    .vgpr_count:     7
    .vgpr_spill_count: 0
    .wavefront_size: 32
amdhsa.target:   amdgcn-amd-amdhsa--gfx1030
amdhsa.version:
  - 1
  - 1
...

        .end_amdgpu_metadata