I installed ninja with conda, so why do I still hit this error? Please help me! T_T
$ ninja --version
1.7.2
$ nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243
pytorch 1.2.0 (py3.7_cuda10.0.130_cudnn7.6.2_0)
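(Side note: nvcc above is CUDA 10.1 while this PyTorch build is against CUDA 10.0.130. I don't think that is what raises the ninja error, but for completeness this is a small sketch of how I check the build versions from Python; the values in the comments are just what I expect for this conda package, actual output may differ.)

import torch

print(torch.__version__)               # 1.2.0
print(torch.version.cuda)              # 10.0.130
print(torch.backends.cudnn.version())  # 7602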
output
Traceback (most recent call last):
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 890, in verify_ninja_availability
    subprocess.check_call('ninja --version'.split(), stdout=devnull)
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/subprocess.py", line 342, in check_call
    retcode = call(*popenargs, **kwargs)
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/subprocess.py", line 323, in call
    with Popen(*popenargs, **kwargs) as p:
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/subprocess.py", line 775, in __init__
    restore_signals, start_new_session)
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/subprocess.py", line 1522, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)
FileNotFoundError: [Errno 2] No such file or directory: 'ninja': 'ninja'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/devdata/new_Relation_Extraction/test_wasserstein.py", line 208, in <module>
    extra_cuda_cflags=["--expt-relaxed-constexpr"])
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 787, in load_inline
    is_python_module)
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 827, in _jit_compile
    with_cuda=with_cuda)
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 850, in _write_ninja_file_and_build
    verify_ninja_availability()
  File "/home/lowen/anaconda3/envs/pytorch/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 892, in verify_ninja_availability
    raise RuntimeError("Ninja is required to load C++ extensions")
RuntimeError: Ninja is required to load C++ extensions
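As far as I can tell from the first traceback, verify_ninja_availability() simply runs 'ninja --version' in a subprocess, so the FileNotFoundError means the interpreter running test_wasserstein.py cannot find a ninja executable on its PATH, even though conda lists ninja as installed. This is a small diagnostic sketch I would run with that same interpreter (standard library only, nothing PyTorch-specific):

import shutil
import subprocess
import sys

# Show which Python is running and whether it can see a ninja binary on PATH.
print("python interpreter:", sys.executable)
print("ninja found at:", shutil.which("ninja"))

# Mirrors what torch.utils.cpp_extension.verify_ninja_availability() does.
try:
    subprocess.check_call(["ninja", "--version"])
    print("ninja is callable from this process")
except FileNotFoundError:
    print("ninja is NOT visible to this process (PATH problem)")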
code
import math
import torch
import torch.utils
import torch.utils.cpp_extension
cuda_source = """
#include <torch/extension.h>
#include <ATen/core/TensorAccessor.h>
#include <ATen/cuda/CUDAContext.h>
using at::RestrictPtrTraits;
using at::PackedTensorAccessor;
#if defined(__HIP_PLATFORM_HCC__)
constexpr int WARP_SIZE = 64;
#else
constexpr int WARP_SIZE = 32;
#endif
// The maximum number of threads in a block
#if defined(__HIP_PLATFORM_HCC__)
constexpr int MAX_BLOCK_SIZE = 256;
#else
constexpr int MAX_BLOCK_SIZE = 512;
#endif
// Returns the index of the most significant 1 bit in `val`.
__device__ __forceinline__ int getMSB(int val) {
  return 31 - __clz(val);
}
// Number of threads in a block given an input size up to MAX_BLOCK_SIZE
static int getNumThreads(int nElem) {
#if defined(__HIP_PLATFORM_HCC__)
  int threadSizes[5] = { 16, 32, 64, 128, MAX_BLOCK_SIZE };
#else
  int threadSizes[5] = { 32, 64, 128, 256, MAX_BLOCK_SIZE };
#endif
  for (int i = 0; i != 5; ++i) {
    if (nElem <= threadSizes[i]) {
      return threadSizes[i];
    }
  }
  return MAX_BLOCK_SIZE;
}
template <typename T>
__device__ __forceinline__ T WARP_SHFL_XOR(T value, int laneMask, int width = warpSize, unsigned int mask = 0xffffffff)
{
#if CUDA_VERSION >= 9000
  return __shfl_xor_sync(mask, value, laneMask, width);
#else
  return __shfl_xor(value, laneMask, width);
#endif
}
// While this might be the most efficient sinkhorn step / logsumexp-matmul implementation I have seen,
// this is awfully inefficient compared to matrix multiplication and e.g. NVidia cutlass may provide
// many great ideas for improvement
template <typename scalar_t, typename index_t>
__global__ void sinkstep_kernel(
  // compute log v_bj = log nu_bj - logsumexp_i 1/lambda dist_ij - log u_bi
  // for this compute maxdiff_bj = max_i(1/lambda dist_ij - log u_bi)
  // i = reduction dim, using threadIdx.x
  PackedTensorAccessor<scalar_t, 2, RestrictPtrTraits, index_t> log_v,
  const PackedTensorAccessor<scalar_t, 2, RestrictPtrTraits, index_t> dist,
  const PackedTensorAccessor<scalar_t, 2, RestrictPtrTraits, index_t> log_nu,
  const PackedTensorAccessor<scalar_t, 2, RestrictPtrTraits, index_t> log_u,
  const scalar_t lambda) {

  using accscalar_t = scalar_t;
  __shared__ accscalar_t shared_mem[2 * WARP_SIZE];

  index_t b = blockIdx.y;
  index_t j = blockIdx.x;
  int tid = threadIdx.x;

  if (b >= log_u.size(0) || j >= log_v.size(1)) {
    return;
  }

  // reduce within thread
  accscalar_t max = -std::numeric_limits<accscalar_t>::infinity();
  accscalar_t sumexp = 0;

  if (log_nu[b][j] == -std::numeric_limits<accscalar_t>::infinity()) {
    if (tid == 0) {
      log_v[b][j] = -std::numeric_limits<accscalar_t>::infinity();
    }
    return;
  }

  for (index_t i = threadIdx.x; i < log_u.size(1); i += blockDim.x) {
    accscalar_t oldmax = max;
    accscalar_t value = -dist[i][j]/lambda + log_u[b][i];
    max = max > value ? max : value;
    if (oldmax == -std::numeric_limits<accscalar_t>::infinity()) {
      // sumexp used to be 0, so the new max is value and we can set 1 here,
      // because we will come back here again
      sumexp = 1;
    } else {
      sumexp *= exp(oldmax - max);
      sumexp += exp(value - max); // if oldmax was not -infinity, max is not either...
    }
  }

  // now we have one value per thread. we'll make it into one value per warp
  // first warpSum to get one value per thread to
  // one value per warp
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    accscalar_t o_max    = WARP_SHFL_XOR(max, 1 << i, WARP_SIZE);
    accscalar_t o_sumexp = WARP_SHFL_XOR(sumexp, 1 << i, WARP_SIZE);
    if (o_max > max) { // we're less concerned about divergence here
      sumexp *= exp(max - o_max);
      sumexp += o_sumexp;
      max = o_max;
    } else if (max != -std::numeric_limits<accscalar_t>::infinity()) {
      sumexp += o_sumexp * exp(o_max - max);
    }
  }

  __syncthreads();
  // this writes each warp's accumulation into shared memory
  // there are at most WARP_SIZE items left because
  // there are at most WARP_SIZE**2 threads at the beginning
  if (tid % WARP_SIZE == 0) {
    shared_mem[tid / WARP_SIZE * 2] = max;
    shared_mem[tid / WARP_SIZE * 2 + 1] = sumexp;
  }
  __syncthreads();
  if (tid < WARP_SIZE) {
    max = (tid < blockDim.x / WARP_SIZE ? shared_mem[2 * tid] : -std::numeric_limits<accscalar_t>::infinity());
    sumexp = (tid < blockDim.x / WARP_SIZE ? shared_mem[2 * tid + 1] : 0);
  }
  for (int i = 0; i < getMSB(WARP_SIZE); ++i) {
    accscalar_t o_max    = WARP_SHFL_XOR(max, 1 << i, WARP_SIZE);
    accscalar_t o_sumexp = WARP_SHFL_XOR(sumexp, 1 << i, WARP_SIZE);
    if (o_max > max) { // we're less concerned about divergence here
      sumexp *= exp(max - o_max);
      sumexp += o_sumexp;
      max = o_max;
    } else if (max != -std::numeric_limits<accscalar_t>::infinity()) {
      sumexp += o_sumexp * exp(o_max - max);
    }
  }

  if (tid == 0) {
    log_v[b][j] = (max > -std::numeric_limits<accscalar_t>::infinity() ?
                   log_nu[b][j] - log(sumexp) - max :
                   -std::numeric_limits<accscalar_t>::infinity());
  }
}
template <typename scalar_t>
torch::Tensor sinkstep_cuda_template(const torch::Tensor& dist, const torch::Tensor& log_nu, const torch::Tensor& log_u,
                                     const double lambda) {
  TORCH_CHECK(dist.is_cuda(), "need cuda tensors");
  TORCH_CHECK(dist.device() == log_nu.device() && dist.device() == log_u.device(), "need tensors on same GPU");
  TORCH_CHECK(dist.dim()==2 && log_nu.dim()==2 && log_u.dim()==2, "invalid sizes");
  TORCH_CHECK(dist.size(0) == log_u.size(1) &&
              dist.size(1) == log_nu.size(1) &&
              log_u.size(0) == log_nu.size(0), "invalid sizes");

  auto log_v = torch::empty_like(log_nu);
  using index_t = int32_t;

  auto log_v_a = log_v.packed_accessor<scalar_t, 2, RestrictPtrTraits, index_t>();
  auto dist_a = dist.packed_accessor<scalar_t, 2, RestrictPtrTraits, index_t>();
  auto log_nu_a = log_nu.packed_accessor<scalar_t, 2, RestrictPtrTraits, index_t>();
  auto log_u_a = log_u.packed_accessor<scalar_t, 2, RestrictPtrTraits, index_t>();

  auto stream = at::cuda::getCurrentCUDAStream();

  int tf = getNumThreads(log_u.size(1));
  dim3 blocks(log_v.size(1), log_u.size(0));
  dim3 threads(tf);

  sinkstep_kernel<<<blocks, threads, 2*WARP_SIZE*sizeof(scalar_t), stream>>>(
    log_v_a, dist_a, log_nu_a, log_u_a, static_cast<scalar_t>(lambda)
  );

  return log_v;
}

torch::Tensor sinkstep_cuda(const torch::Tensor& dist, const torch::Tensor& log_nu, const torch::Tensor& log_u,
                            const double lambda) {
  return AT_DISPATCH_FLOATING_TYPES(log_u.scalar_type(), "sinkstep", [&] {
    return sinkstep_cuda_template<scalar_t>(dist, log_nu, log_u, lambda);
  });
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("sinkstep", &sinkstep_cuda, "sinkhorn step");
}
"""
wasserstein_ext = torch.utils.cpp_extension.load_inline(
    "wasserstein", cpp_sources="", cuda_sources=cuda_source,
    extra_cuda_cflags=["--expt-relaxed-constexpr"])
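For reference, once the extension actually builds, load_inline returns a module exposing the sinkstep binding defined above. From the TORCH_CHECKs in sinkstep_cuda_template, dist must be (n, m), log_u (batch, n) and log_nu (batch, m), all on the same GPU, and the call returns log_v of shape (batch, m). A rough smoke test with made-up shapes and a made-up lambda value:

# Hypothetical smoke test; shapes follow the TORCH_CHECKs above:
# dist (n, m), log_u (batch, n), log_nu (batch, m), all on one GPU.
device = torch.device("cuda")
batch, n, m = 2, 64, 80
dist = torch.rand(n, m, device=device)
log_u = torch.full((batch, n), -math.log(n), device=device)
log_nu = torch.full((batch, m), -math.log(m), device=device)

log_v = wasserstein_ext.sinkstep(dist, log_nu, log_u, 1e-3)
print(log_v.shape)  # expected: torch.Size([2, 80])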