当前位置: 首页 > news >正文

wordpress是什么东西搜索引擎优化关键字

wordpress是什么东西,搜索引擎优化关键字,一级服务器二级服务器,iis做网站主目录选哪里RTX3060 FP64测试与猜想 一.小结二.查看FP64的峰值性能三.打满FP64、FP32的利用率,对比差异四.进一步证明pipe_fp64_cycles_active并不是2个fp64 core的metrics RTX3060 FP64测试与猜想 一.小结 RTX3060 compute capability为8.6,每个SM有2个FP64 core。每个cycle可输出2个fp…

RTX3060 FP64测试与猜想

  • 一.小结
  • 二.查看FP64的峰值性能
  • 三.打满FP64、FP32的利用率,对比差异
  • 四.进一步证明pipe_fp64_cycles_active并不是2个fp64 core的metrics

RTX3060 FP64测试与猜想

一.小结

  • RTX3060 compute capability为8.6,每个SM有2个FP64 core。每个cycle可输出2个fp64的结果
  • RTX3060 有4个subcore,这2个core怎么给4个sub_core分呢
  • 执行FP64 DADD指令时,MIO PQ利用率超20%(FADD指令不存在该现象),且fp64 pipe的利用率最多为84%
  • 每个smsp 执行一条DADD warp指令 pipe_fp64_cycles_active 增加16个cycle,4个smsp一起运行一条DADD warp指令仍是16个cycle
  • 猜测:
    • smsp按 1DADD/cycle 交替发送给2个FP64 core,一个warp需要16个cycle(32inst/16cycle->2inst/cycle)
    • 如果4个sub core同时按这个速度发,则超过了FP64的处理能力(8inst/cycle > 2inst/cycle),但pipe_fp64_cycles_active没有增加
    • 说明,在发射FP64指令之前会检测资源的可用性,如果不足,则不发射,pipe_fp64_cycles_active也就不会增加
    • 也就解释了4个sub core一起执行时,pipe_fp64_cycles_active.max还是16个cycle
    • 执行FP64指令时,4个subcore通过MIO共享FP64实际的执行单元

二.查看FP64的峰值性能

tee fp64_peak_sustained.cu<<-'EOF'
#include <cuda_runtime.h>
#include <cuda.h>
__global__ void fake_kernel(){}
int main(int argc,char *argv[])
{fake_kernel<<<1, 1>>>();cudaDeviceSynchronize();
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -lineinfo  -o fp64_peak_sustained fp64_peak_sustained.cu \-I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
sm__sass_thread_inst_executed_op_fp64_pred_on.avg.peak_sustained,\
sm__sass_thread_inst_executed_op_fp64_pred_on.sum.peak_sustained ./fp64_peak_sustained

输出

fake_kernel() (1, 1, 1)x(1, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
---------------------------------------------------------------- ----------- ------------
Metric Name                                                      Metric Unit Metric Value
---------------------------------------------------------------- ----------- ------------
sm__sass_thread_inst_executed_op_fp64_pred_on.avg.peak_sustained  inst/cycle            2  #每个sm的峰值性能
sm__sass_thread_inst_executed_op_fp64_pred_on.sum.peak_sustained  inst/cycle           56  #28个sm
---------------------------------------------------------------- ----------- ------------
  • 2 FP64 cores in devices of compute capability 8.6, 8.7 and 8.9
  • 问题:这2个core怎么给4个sub_core分呢?

三.打满FP64、FP32的利用率,对比差异

tee fp64_test.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>__global__ void kernel_add_float(volatile float *input,volatile float *output)
{unsigned int tid  = threadIdx.x + blockIdx.x * blockDim.x;float l=input[tid];float r=output[tid];for(int i=0;i<256;i++){l-=r;} input[tid]=l;
}
__global__ void kernel_add_double(volatile double *input,volatile double *output)
{unsigned int tid  = threadIdx.x + blockIdx.x * blockDim.x;double left=input[tid];double right=output[tid];for(int i=0;i<256;i++){left+=right;}       output[tid]=left;
}
EOF/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx fp64_test.cu -o fp64_test.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.ptx -cubin -o fp64_test.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
cat fp64_test.ptx
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin# 删掉除DADD、FADD以外的指令
cuasm.py fp64_test.cubin fp64_test.cuasm
sed '/MOV/d' -i fp64_test.cuasm
sed '/S2R/d' -i fp64_test.cuasm
sed '/ULDC/d' -i fp64_test.cuasm
sed '/IMAD/d' -i fp64_test.cuasm
sed '/LDG/d' -i fp64_test.cuasm
sed '/STG/d' -i fp64_test.cuasm
sed '/F2F/d' -i fp64_test.cuasmcuasm.py fp64_test.cuasm
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbintee fp64_test_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>int main(int argc,char *argv[])
{CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount = 0;error = cuDeviceGetCount(&deviceCount);error = cuDeviceGet(&cuDevice, 0);if(error!=CUDA_SUCCESS){printf("Error happened in get device!\n");}CUcontext cuContext;error = cuCtxCreate(&cuContext, 0, cuDevice);if(error!=CUDA_SUCCESS){printf("Error happened in create context!\n");}int block_count=28*1000;int block_size=32*4*4;int thread_size=block_count*block_size;int data_size=sizeof(double)*thread_size;double *output_ptr=nullptr;double *input_ptr=nullptr;int cudaStatus=0;cudaStatus = cudaMalloc((void**)&input_ptr, data_size);cudaStatus = cudaMalloc((void**)&output_ptr, data_size);void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};CUmodule module;CUfunction double_function;CUfunction float_function;const char* module_file = "fp64_test.fatbin";const char* double_kernel_name = "_Z17kernel_add_doublePVdS0_";const char* float_kernel_name = "_Z16kernel_add_floatPVfS0_";error = cuModuleLoad(&module, module_file);if(error!=CUDA_SUCCESS){printf("Error happened in load moudle %d!\n",error);}error = cuModuleGetFunction(&double_function, module, double_kernel_name);if(error!=CUDA_SUCCESS){printf("get double_function error!\n");}error = cuModuleGetFunction(&float_function, module, float_kernel_name);if(error!=CUDA_SUCCESS){printf("get float_kernel_name error!\n");}    cuLaunchKernel(double_function,block_count, 1, 1,block_size, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(float_function,block_count, 1, 1,block_size, 1, 1,0,0,kernelParams, 0);cudaFree(output_ptr);cudaFree(input_ptr);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0;
}
EOF
g++ fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics \
sm__inst_executed.avg.pct_of_peak_sustained_elapsed,\
smsp__inst_issued.sum,\
sm__issue_active.avg.pct_of_peak_sustained_elapsed,\
sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_elapsed,\
sm__pipe_fmaheavy_cycles_active.avg.pct_of_peak_sustained_elapsed,\
sm__inst_executed_pipe_cbu_pred_on_any.avg.pct_of_peak_sustained_elapsed,\
sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed,\
sm__mio_pq_read_cycles_active.avg.pct_of_peak_sustained_elapsed,\
sm__mio_pq_write_cycles_active.avg.pct_of_peak_sustained_elapsed,\
sm__mio_pq_write_cycles_active_pipe_lsu.avg.pct_of_peak_sustained_elapsed,\
sm__mio_pq_write_cycles_active_pipe_tex.avg.pct_of_peak_sustained_elapsed,\
sm__mioc_inst_issued.avg.pct_of_peak_sustained_elapsed,\
sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed,\
sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_elapsed ./fp64_test_main

输出

kernel_add_double(volatile double *, volatile double *) (28000, 1, 1)x(512, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------------------------------------------------------- ----------- ------------
Metric Name                                                               Metric Unit Metric Value
------------------------------------------------------------------------- ----------- ------------
sm__inst_executed.avg.pct_of_peak_sustained_elapsed                                 %         1.32
sm__inst_executed_pipe_cbu_pred_on_any.avg.pct_of_peak_sustained_elapsed            %         0.02
sm__issue_active.avg.pct_of_peak_sustained_elapsed                                  %         1.32
sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed                               %         3.51
sm__mio_pq_read_cycles_active.avg.pct_of_peak_sustained_elapsed                     %            0
sm__mio_pq_write_cycles_active.avg.pct_of_peak_sustained_elapsed                    %        21.05
sm__mio_pq_write_cycles_active_pipe_lsu.avg.pct_of_peak_sustained_elapsed           %            0
sm__mio_pq_write_cycles_active_pipe_tex.avg.pct_of_peak_sustained_elapsed           %        21.05 # of cycles where register operands from the register file werewritten to MIO PQ, for the tex pipe
sm__mioc_inst_issued.avg.pct_of_peak_sustained_elapsed                              %         1.32
sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_elapsed                        %            0
sm__pipe_fmaheavy_cycles_active.avg.pct_of_peak_sustained_elapsed                   %            0
sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_elapsed                       %        84.21  #利用率打不满
smsp__inst_issued.sum                                                            inst  115,136,000  #跟fp32相同的指令条数
------------------------------------------------------------------------- ----------- ------------kernel_add_float(volatile float *, volatile float *) (28000, 1, 1)x(512, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------------------------------------------------------- ----------- ------------
Metric Name                                                               Metric Unit Metric Value
------------------------------------------------------------------------- ----------- ------------
sm__inst_executed.avg.pct_of_peak_sustained_elapsed                                 %        99.76
sm__inst_executed_pipe_cbu_pred_on_any.avg.pct_of_peak_sustained_elapsed            %         1.55
sm__issue_active.avg.pct_of_peak_sustained_elapsed                                  %        99.76
sm__mio_inst_issued.avg.pct_of_peak_sustained_elapsed                               %            0
sm__mio_pq_read_cycles_active.avg.pct_of_peak_sustained_elapsed                     %            0
sm__mio_pq_write_cycles_active.avg.pct_of_peak_sustained_elapsed                    %            0
sm__mio_pq_write_cycles_active_pipe_lsu.avg.pct_of_peak_sustained_elapsed           %            0
sm__mio_pq_write_cycles_active_pipe_tex.avg.pct_of_peak_sustained_elapsed           %            0
sm__mioc_inst_issued.avg.pct_of_peak_sustained_elapsed                              %         0.39
sm__pipe_fma_cycles_active.avg.pct_of_peak_sustained_elapsed                        %        99.37
sm__pipe_fmaheavy_cycles_active.avg.pct_of_peak_sustained_elapsed                   %        99.37
sm__pipe_fp64_cycles_active.avg.pct_of_peak_sustained_elapsed                       %            0
smsp__inst_issued.sum                                                            inst  115,136,000
------------------------------------------------------------------------- ----------- ------------

*猜测,sm__pipe_fp64_cycles_active并不是那2个FP64 core的metrics,而是smsp里通往fp64 core的接口模块的活动cycle数
*4个subcore里的fp64接口模块,连接到2个fp64 core,并且经过了mio模块.因此,无法打满fp64的利用率

四.进一步证明pipe_fp64_cycles_active并不是2个fp64 core的metrics

tee fp64_test.cu<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>__global__ void kernel_add_double(volatile double *input,volatile double *output)
{unsigned int tid  = threadIdx.x + blockIdx.x * blockDim.x;double left=input[tid];double right=output[tid];for(int i=0;i<1;i++){left+=right;}       output[tid]=left;
}
EOF/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx fp64_test.cu -o fp64_test.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.ptx -cubin -o fp64_test.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
cat fp64_test.ptx
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbincuasm.py fp64_test.cubin fp64_test.cuasm
sed '/MOV/d' -i fp64_test.cuasm
sed '/S2R/d' -i fp64_test.cuasm
sed '/ULDC/d' -i fp64_test.cuasm
sed '/IMAD/d' -i fp64_test.cuasm
sed '/LDG/d' -i fp64_test.cuasm
sed '/STG/d' -i fp64_test.cuasm
sed '/F2F/d' -i fp64_test.cuasmcuasm.py fp64_test.cuasm
/usr/local/cuda/bin/nvcc -arch=sm_86 fp64_test.cubin -fatbin -o fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-sass fp64_test.fatbin
/usr/local/cuda/bin/cuobjdump --dump-resource-usage fp64_test.fatbintee fp64_test_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>int main(int argc,char *argv[])
{CUresult error;CUdevice cuDevice;cuInit(0);int deviceCount = 0;error = cuDeviceGetCount(&deviceCount);error = cuDeviceGet(&cuDevice, 0);if(error!=CUDA_SUCCESS){printf("Error happened in get device!\n");}CUcontext cuContext;error = cuCtxCreate(&cuContext, 0, cuDevice);if(error!=CUDA_SUCCESS){printf("Error happened in create context!\n");}int block_count=1;int block_size=32*4*4;int thread_size=block_count*block_size;int data_size=sizeof(double)*thread_size;double *output_ptr=nullptr;double *input_ptr=nullptr;int cudaStatus=0;cudaStatus = cudaMalloc((void**)&input_ptr, data_size);cudaStatus = cudaMalloc((void**)&output_ptr, data_size);void *kernelParams[]= {(void*)&output_ptr, (void*)&input_ptr};CUmodule module;CUfunction double_function;const char* module_file = "fp64_test.fatbin";const char* double_kernel_name = "_Z17kernel_add_doublePVdS0_";error = cuModuleLoad(&module, module_file);if(error!=CUDA_SUCCESS){printf("Error happened in load moudle %d!\n",error);}error = cuModuleGetFunction(&double_function, module, double_kernel_name);if(error!=CUDA_SUCCESS){printf("get float_kernel_name error!\n");}    cuLaunchKernel(double_function,block_count, 1, 1,8, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,16, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*2, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*4, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*5, 1, 1,0,0,kernelParams, 0);cuLaunchKernel(double_function,block_count, 1, 1,32*4*8, 1, 1,0,0,kernelParams, 0);cudaFree(output_ptr);cudaFree(input_ptr);cuModuleUnload(module);cuCtxDestroy(cuContext);return 0;
}
EOF
g++ fp64_test_main.cpp -o fp64_test_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__pipe_fp64_cycles_active ./fp64_test_main

输出

kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(8, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         0.14
smsp__pipe_fp64_cycles_active.max       cycle           16
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle           16
--------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(16, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         0.14
smsp__pipe_fp64_cycles_active.max       cycle           16  #不足一个warp跟一个warp 的pipe_fp64_cycles_active一样,说明存在无效计算
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle           16
--------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(32, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         0.14
smsp__pipe_fp64_cycles_active.max       cycle           16
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle           16
--------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(64, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         0.29
smsp__pipe_fp64_cycles_active.max       cycle           16
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle           32
--------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(128, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         0.57
smsp__pipe_fp64_cycles_active.max       cycle           16
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle           64
--------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(160, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         0.71
smsp__pipe_fp64_cycles_active.max       cycle           32
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle           80
--------------------------------- ----------- ------------kernel_add_double(volatile double *, volatile double *) (1, 1, 1)x(1024, 1, 1), Context 1, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
--------------------------------- ----------- ------------
Metric Name                       Metric Unit Metric Value
--------------------------------- ----------- ------------
smsp__pipe_fp64_cycles_active.avg       cycle         4.57
smsp__pipe_fp64_cycles_active.max       cycle          128
smsp__pipe_fp64_cycles_active.min       cycle            0
smsp__pipe_fp64_cycles_active.sum       cycle          512 #每个smsp 执行一个warp的fp64需要16个pipe_fp64_cycles_active
--------------------------------- ----------- ------------
  • 如果是2个fp64 cores的metrics,不会出现这样的现象
http://www.dinnco.com/news/30431.html

相关文章:

  • 独立站店铺怎么注册aso优化报价
  • 南开网站建设爱站seo工具
  • 网站设计公司网站专业如何进行搜索引擎营销
  • 青岛谷歌网站建设搜索引擎优化是什么工作
  • 长春建筑公司有哪些公司seo编辑招聘
  • 网站右侧 回到顶部百度seo在哪里
  • 网站建设开发设计营销公司山东百度本地惠生活推广
  • 太原网站制作报价seo实战培训教程
  • 什么是一学一做视频网站中国十大企业培训机构排名
  • 网站版面风格茶叶网络推广方案
  • 湛江做寄生虫网站seddog站长之家
  • 测试网站小程序怎么做品牌网站建设制作
  • 电商管理系统百度怎么优化排名
  • 企业网站做速优化排名万象网页开发工具
  • 筑招建筑人才网长春安全员seo关键字优化技巧
  • 镇江哪里做网站小升初最好的补课机构排行榜
  • 做外贸比较好得网站外国网站开放的浏览器
  • 找人做网站都需要提供什么seo查询平台
  • 专业做网站排名的人企业网站推广的形式有哪些
  • wordpress 手机支付乐天seo培训
  • 国内优秀网页设计案例天津百度seo
  • 青岛君哲网站建设公司重庆seo技术博客
  • 哪个网站可以付费做淘宝推广百度空间登录入口
  • 备案网站负责人网站自助建站系统
  • 乐基儿做黎明网站的女郎今日新闻最新事件
  • 赤峰市哪里做网站网络推广主要是做什么工作
  • 深圳外贸建站与推广西安优化排名推广
  • 镇海官方网站建设网页设计作品集
  • 本地环说wordpress配置邮箱seo一个月赚多少钱
  • 宝安区哪一个街道最富裕seo云优化如何