; You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

; 193 lines
; 6.6 KiB

; AMDGPU (GCN) assembly, LLVM MC syntax; target gfx900 with the amdhsa ABI.
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx900"
;-----------------------------------------------------------------------
; void vector_square_kernel<float>(float*, float const*, unsigned long long)
;   (demangled form of _Z20vector_square_kernelIfEvPT_PKS0_y)
;
; Each lane squares elements of a source array into a destination array:
;     for (i = first; i < count; i += stride)  dst[i] = src[i] * src[i];
;   first  = workgroup_id_x * workgroup_size_x + workitem_id_x
;            (32-bit sum, zero-extended to 64 bits)
;   stride = dword at dispatch-packet offset 0xc, zero-extended to 64 bits
;            (grid_size_x in the HSA kernel dispatch packet layout)
;
; Entry registers, as enabled by this kernel's .amdhsa_kernel descriptor:
;   s[4:5]   dispatch packet pointer    s[6:7]   kernarg segment pointer
;   s8       workgroup id X             v0       work-item id X
;   s[0:3]   private segment buffer; never read, reused as scratch
; Kernarg layout used by the code:
;   +0x00    dst pointer (stored to)    +0x08    src pointer (loaded from)
;   +0x10    64-bit element count
; Register roles once the loop is running:
;   v[0:1]   element index i            v[2:3]   byte offset i * 4
;   s[8:9]   dst base                   s[10:11] src base
;   s[12:13] count                      s[14:15] stride in elements
;   s[4:5]   stride in bytes            s[6:7]   mask of lanes that are done
;   v[4:5]   address scratch            v6       loaded value / result
;   v7, v8   VGPR copies of s15 and s5 (high words of the two strides);
;            presumably needed because the carry-in already occupies the
;            single scalar-operand slot of the add -- TODO confirm
; The exec mask saved in s[0:1] at the range check is never restored;
; s[0:1] is reused as a carry register inside the loop.
;-----------------------------------------------------------------------
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.protected _Z20vector_square_kernelIfEvPT_PKS0_y ; -- Begin function _Z20vector_square_kernelIfEvPT_PKS0_y
.globl _Z20vector_square_kernelIfEvPT_PKS0_y
.p2align 8                                  ; entry aligned to 2^8 = 256 bytes
.type _Z20vector_square_kernelIfEvPT_PKS0_y,@function
_Z20vector_square_kernelIfEvPT_PKS0_y: ; @_Z20vector_square_kernelIfEvPT_PKS0_y
; %bb.0: entry -- compute this lane's first index and test it against count
s_load_dword s0, s[4:5], 0x4                ; s0 = dispatch packet dword at +0x4
s_load_dwordx2 s[12:13], s[6:7], 0x10       ; s[12:13] = count (kernarg +0x10)
v_mov_b32_e32 v1, 0                         ; high word of the 64-bit index = 0
s_waitcnt lgkmcnt(0)                        ; wait for both scalar loads
s_and_b32 s0, s0, 0xffff                    ; keep low 16 bits: workgroup size X
s_mul_i32 s8, s8, s0                        ; s8 = workgroup_id_x * workgroup_size_x
v_add_u32_e32 v0, s8, v0                    ; v0 = s8 + workitem_id_x (no carry out)
v_cmp_gt_u64_e32 vcc, s[12:13], v[0:1]      ; vcc = lanes where count > i
s_and_saveexec_b64 s[0:1], vcc              ; s[0:1] = old exec; exec &= vcc
s_cbranch_execz .LBB0_3                     ; no lane in range -> finish
; %bb.1: loop preheader -- fetch both pointers and the stride
s_load_dword s14, s[4:5], 0xc               ; s14 = dispatch packet dword at +0xc (stride)
s_load_dwordx4 s[8:11], s[6:7], 0x0         ; s[8:9] = dst, s[10:11] = src (kernarg +0x0 / +0x8)
s_mov_b32 s15, 0                            ; s[14:15] = stride zero-extended to 64 bits
v_lshlrev_b64 v[2:3], 2, v[0:1]             ; v[2:3] = i * 4 (byte offset of a 4-byte element)
s_mov_b64 s[6:7], 0                         ; no lane has finished yet
s_waitcnt lgkmcnt(0)                        ; wait for s14 and s[8:11]
s_lshl_b64 s[4:5], s[14:15], 2              ; s[4:5] = stride * 4 (bytes per step)
.LBB0_2: ; =>This Inner Loop Header: Depth=1
v_mov_b32_e32 v5, s11                       ; src base, high word
v_add_co_u32_e32 v4, vcc, s10, v2           ; v[4:5] = src + byte offset (low word, carry -> vcc)
v_addc_co_u32_e32 v5, vcc, v5, v3, vcc      ;   high word plus carry
global_load_dword v6, v[4:5], off           ; v6 = src[i]
v_mov_b32_e32 v5, s9                        ; dst base, high word
v_mov_b32_e32 v7, s15                       ; element stride, high word (always 0)
v_add_co_u32_e32 v0, vcc, s14, v0           ; i += stride (low word, carry -> vcc)
v_mov_b32_e32 v8, s5                        ; byte stride, high word
v_add_co_u32_e64 v4, s[0:1], s8, v2         ; v[4:5] = dst + byte offset (low word, carry -> s[0:1])
v_add_co_u32_e64 v2, s[2:3], s4, v2         ; byte offset += byte stride (low word, carry -> s[2:3])
v_addc_co_u32_e64 v5, s[0:1], v5, v3, s[0:1] ; dst address high word; reads v3 before it is advanced
v_addc_co_u32_e32 v1, vcc, v1, v7, vcc      ; i high word plus carry
v_addc_co_u32_e64 v3, vcc, v3, v8, s[2:3]   ; byte offset high word; carry-out in vcc is unused
v_cmp_le_u64_e32 vcc, s[12:13], v[0:1]      ; vcc = lanes whose next i is >= count
s_or_b64 s[6:7], vcc, s[6:7]                ; add those lanes to the finished mask
s_waitcnt vmcnt(0)                          ; wait for the load of src[i]
v_mul_f32_e32 v6, v6, v6                    ; v6 = src[i] * src[i]
global_store_dword v[4:5], v6, off          ; dst[i] = v6
s_andn2_b64 exec, exec, s[6:7]              ; drop finished lanes from exec
s_cbranch_execnz .LBB0_2                    ; loop while any lane is still active
.LBB0_3:
s_endpgm                                    ; terminate the wave
;-----------------------------------------------------------------------
; Kernel descriptor for _Z20vector_square_kernelIfEvPT_PKS0_y.
; Placed in .rodata on a 64-byte (2^6) boundary. Every .amdhsa_* line sets
; one descriptor field; the lines are grouped by purpose below.
;-----------------------------------------------------------------------
.section .rodata,#alloc
.p2align 6
.amdhsa_kernel _Z20vector_square_kernelIfEvPT_PKS0_y
    ; --- register budget --------------------------------------------
    .amdhsa_next_free_vgpr 9
    .amdhsa_next_free_sgpr 16
    .amdhsa_reserve_flat_scratch 0
    .amdhsa_reserve_xnack_mask 1

    ; --- memory segment sizes, in bytes -----------------------------
    .amdhsa_kernarg_size 24
    .amdhsa_group_segment_fixed_size 0
    .amdhsa_private_segment_fixed_size 0

    ; --- user SGPRs preloaded at wave start (8 in total) ------------
    ; enabled: private segment buffer, dispatch pointer, kernarg pointer
    .amdhsa_user_sgpr_count 8
    .amdhsa_user_sgpr_private_segment_buffer 1
    .amdhsa_user_sgpr_dispatch_ptr 1
    .amdhsa_user_sgpr_kernarg_segment_ptr 1
    ; not requested:
    .amdhsa_user_sgpr_queue_ptr 0
    .amdhsa_user_sgpr_dispatch_id 0
    .amdhsa_user_sgpr_flat_scratch_init 0
    .amdhsa_user_sgpr_private_segment_size 0

    ; --- system SGPRs / VGPRs: only the X workgroup id is requested --
    .amdhsa_system_sgpr_workgroup_id_x 1
    .amdhsa_system_sgpr_workgroup_id_y 0
    .amdhsa_system_sgpr_workgroup_id_z 0
    .amdhsa_system_sgpr_workgroup_info 0
    .amdhsa_system_sgpr_private_segment_wavefront_offset 0
    .amdhsa_system_vgpr_workitem_id 0

    ; --- floating-point mode ----------------------------------------
    .amdhsa_float_round_mode_32 0
    .amdhsa_float_round_mode_16_64 0
    .amdhsa_float_denorm_mode_32 3
    .amdhsa_float_denorm_mode_16_64 3
    .amdhsa_ieee_mode 1
    .amdhsa_dx10_clamp 1
    .amdhsa_fp16_overflow 0

    ; --- exception enables: all off ---------------------------------
    .amdhsa_exception_fp_ieee_invalid_op 0
    .amdhsa_exception_fp_denorm_src 0
    .amdhsa_exception_fp_ieee_div_zero 0
    .amdhsa_exception_fp_ieee_overflow 0
    .amdhsa_exception_fp_ieee_underflow 0
    .amdhsa_exception_fp_ieee_inexact 0
    .amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
; Switch back to the kernel's own text section so the end label lands
; directly after its last instruction.
.section .text._Z20vector_square_kernelIfEvPT_PKS0_y,#alloc,#execinstr
.Lfunc_end0:
; Symbol size = distance from the entry label to the end label.
.size _Z20vector_square_kernelIfEvPT_PKS0_y, .Lfunc_end0-_Z20vector_square_kernelIfEvPT_PKS0_y
; -- End function
; Section switch only; the lines that follow it are comments.
.section .AMDGPU.csdata
; Kernel info:
; codeLenInByte = 216
; NumSgprs: 18
; NumVgprs: 9
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 2
; VGPRBlocks: 2
; NumSGPRsForWavesPerEU: 18
; NumVGPRsForWavesPerEU: 9
; Occupancy: 10
; WaveLimiterHint : 1
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 8
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; __HIP_Coordinates<__HIP_BlockIdx>::x
; One zero byte in its own read-only section; weak binding, protected
; visibility. Nothing in the kernel code above reads it.
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,#alloc
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
.type _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE,@object
_ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE:
.byte 0
.size _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE, 1
; __HIP_Coordinates<__HIP_BlockDim>::x
; One zero byte in its own read-only section; weak binding, protected
; visibility. Nothing in the kernel code above reads it.
.section .rodata._ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,#alloc
.weak _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
.protected _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
.type _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE,@object
_ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE:
.byte 0
.size _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE, 1
; __HIP_Coordinates<__HIP_ThreadIdx>::x
; One zero byte in its own read-only section; weak binding, protected
; visibility. Nothing in the kernel code above reads it.
.section .rodata._ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,#alloc
.weak _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
.protected _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
.type _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE,@object
_ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE:
.byte 0
.size _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE, 1
; __HIP_Coordinates<__HIP_GridDim>::x
; One zero byte in its own read-only section; weak binding, protected
; visibility. Nothing in the kernel code above reads it.
.section .rodata._ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,#alloc
.weak _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
.protected _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
.type _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE,@object
_ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE:
.byte 0
.size _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE, 1
; Producer identification string recorded in the object file.
.ident "AMD clang version 15.0.0 (https://github.com/RadeonOpenCompute/llvm-project roc-5.3.0 22362 3cf23f77f8208174a2ee7c616f4be23674d7b081)"
; Empty marker section; by ELF toolchain convention it signals that no
; executable stack is required.
.section ".note.GNU-stack"
; LLVM address-significance table: .addrsig requests the table and each
; .addrsig_sym records one symbol as address-significant. The four symbols
; listed are the 1-byte __HIP_Coordinates<...>::x objects defined in this file.
.addrsig
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockIdxE1xE
.addrsig_sym _ZN17__HIP_CoordinatesI14__HIP_BlockDimE1xE
.addrsig_sym _ZN17__HIP_CoordinatesI15__HIP_ThreadIdxE1xE
.addrsig_sym _ZN17__HIP_CoordinatesI13__HIP_GridDimE1xE
;-----------------------------------------------------------------------
; Code-object metadata, a YAML document between .amdgpu_metadata and
; .end_amdgpu_metadata. YAML nesting is expressed by indentation, so the
; indentation below is significant:
;   amdhsa.kernels  list with one kernel entry
;     .args         three kernel arguments, 24 bytes of kernarg in total:
;                     +0  8-byte global pointer (the kernel stores through it)
;                     +8  8-byte global pointer (the kernel loads through it)
;                     +16 8-byte by-value integer (element count)
;     remaining keys  per-kernel resource usage and symbol names
;   amdhsa.target   target triple and processor
;   amdhsa.version  metadata format version, [major, minor]
; Comments are kept outside the YAML payload on purpose.
;-----------------------------------------------------------------------
.amdgpu_metadata
---
amdhsa.kernels:
  - .args:
      - .address_space: global
        .offset: 0
        .size: 8
        .value_kind: global_buffer
      - .address_space: global
        .offset: 8
        .size: 8
        .value_kind: global_buffer
      - .offset: 16
        .size: 8
        .value_kind: by_value
    .group_segment_fixed_size: 0
    .kernarg_segment_align: 8
    .kernarg_segment_size: 24
    .language: OpenCL C
    .language_version:
      - 2
      - 0
    .max_flat_workgroup_size: 1024
    .name: _Z20vector_square_kernelIfEvPT_PKS0_y
    .private_segment_fixed_size: 0
    .sgpr_count: 18
    .sgpr_spill_count: 0
    .symbol: _Z20vector_square_kernelIfEvPT_PKS0_y.kd
    .vgpr_count: 9
    .vgpr_spill_count: 0
    .wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx900
amdhsa.version:
  - 1
  - 1
...
.end_amdgpu_metadata