Other guides usually have you install the driver version that matches your GPU; I simply installed nvidia-367 (to pair with CUDA 8.0), which also works. (Note that the PPA may pull in a newer build: the nvidia-smi output below reports 384.111.)
You can also refer to: http://blog.csdn.net/xuzhongxiong/article/details/52717285
Install the driver and the OpenGL development packages:
root@master# sudo add-apt-repository ppa:xorg-edgers/ppa
root@master# sudo apt-get update
root@master# sudo apt-get install nvidia-367
root@master# sudo apt-get install mesa-common-dev
root@master# sudo apt-get install freeglut3-dev
root@master# nvidia-smi
Sun Feb 11 11:18:43 2018
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 384.111                Driver Version: 384.111                   |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  GeForce 920M        Off  | 00000000:01:00.0 N/A |                  N/A |
| N/A   41C    P5    N/A /  N/A |    129MiB /  2004MiB |     N/A      Default |
+-------------------------------+----------------------+----------------------+

+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|=============================================================================|
|    0                    Not Supported                                       |
+-----------------------------------------------------------------------------+
If you see output like this, the driver installation succeeded.
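If you prefer to verify the driver from code rather than from nvidia-smi, a small driver-API probe also works. This is only a sketch (driver_probe.cpp is an illustrative file name, not from any guide), and note that cuda.h only becomes available after the toolkit install in the next step:

// driver_probe.cpp - minimal driver-API check (illustrative sketch; needs the
// toolkit headers from the next step for cuda.h, links against the driver's libcuda)
// Build: g++ driver_probe.cpp -I/usr/local/cuda-8.0/include -lcuda -o driver_probe
#include <cstdio>
#include <cuda.h>

int main()
{
    if (cuInit(0) != CUDA_SUCCESS)
    {
        printf("cuInit failed - the kernel driver is not loaded correctly\n");
        return 1;
    }

    int n = 0;
    cuDeviceGetCount(&n);
    printf("Driver OK, %d CUDA device(s) visible\n", n);
    return 0;
}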
Download the CUDA Toolkit from https://developer.nvidia.com/cuda-toolkit
Be sure to choose version 8.0; the download is about 1.4 GB. Then run the installer:
root@master# sudo sh cuda_8.0.27_linux.run
A very long license text is shown first; keep pressing ENTER until you are prompted, then type accept. Since the driver was already installed above, do NOT select the driver installation here. Accept the defaults or answer yes for everything else.
Add CUDA to your environment by appending these lines to /etc/profile:
root@master# vim /etc/profile
export PATH=/usr/local/cuda-8.0/bin:$PATH
export LD_LIBRARY_PATH=/usr/local/cuda-8.0/lib64:$LD_LIBRARY_PATH
root@master# source /etc/profile
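To confirm that the runtime and the driver agree on versions, a tiny program is enough. This is only a sketch (version_check.cu is an illustrative file name); cudaDriverGetVersion() and cudaRuntimeGetVersion() are standard CUDA runtime calls:

// version_check.cu - print the CUDA version the driver supports vs. the
// runtime this binary was built against (file name is illustrative)
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int driverVer = 0, runtimeVer = 0;
    cudaDriverGetVersion(&driverVer);    // highest CUDA version the driver supports
    cudaRuntimeGetVersion(&runtimeVer);  // CUDA runtime this binary links against
    printf("Driver supports CUDA %d.%d, runtime is CUDA %d.%d\n",
           driverVer / 1000, (driverVer % 100) / 10,
           runtimeVer / 1000, (runtimeVer % 100) / 10);
    return 0;
}

Build it with nvcc version_check.cu -o version_check; on this setup it should report 9.0 for the driver and 8.0 for the runtime, matching the deviceQuery output below.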
Verify the installation by building and running the deviceQuery sample:
root@master# cd /usr/local/cuda-8.0/samples/1_Utilities/deviceQuery
root@master# sudo make
root@master# ./deviceQuery

CUDA Device Query (Runtime API) version (CUDART static linking)

Detected 1 CUDA Capable device(s)

Device 0: "GeForce 920M"
  CUDA Driver Version / Runtime Version          9.0 / 8.0
  CUDA Capability Major/Minor version number:    3.5
  Total amount of global memory:                 2004 MBytes (2101542912 bytes)
  ( 2) Multiprocessors, (192) CUDA Cores/MP:     384 CUDA Cores
  GPU Max Clock rate:                            954 MHz (0.95 GHz)
  Memory Clock rate:                             900 Mhz
  Memory Bus Width:                              64-bit
  L2 Cache Size:                                 524288 bytes
  Maximum Texture Dimension Size (x,y,z)         1D=(65536), 2D=(65536, 65536), 3D=(4096, 4096, 4096)
  Maximum Layered 1D Texture Size, (num) layers  1D=(16384), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(16384, 16384), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  2048
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z):  (1024, 1024, 64)
  Max dimension size of a grid size    (x,y,z):  (2147483647, 65535, 65535)
  Maximum memory pitch:                          2147483647 bytes
  Texture alignment:                             512 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  Device supports Unified Addressing (UVA):      Yes
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
Result = PASS
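Most of what deviceQuery prints comes from a single runtime call, cudaGetDeviceProperties(). As a sketch, a stripped-down query could look like this (mini_query.cu is a made-up name, not part of the samples):

// mini_query.cu - minimal re-implementation of the core of deviceQuery
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    int n = 0;
    cudaGetDeviceCount(&n);
    printf("Detected %d CUDA capable device(s)\n", n);

    for (int i = 0; i < n; i++)
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, i);
        printf("Device %d: \"%s\", compute capability %d.%d, %zu MiB global memory, %d multiprocessor(s)\n",
               i, prop.name, prop.major, prop.minor,
               prop.totalGlobalMem >> 20, prop.multiProcessorCount);
    }
    return 0;
}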
The CUDA compile command (as invoked by the samples Makefile):
root@master# "/usr/local/cuda-8.0"/bin/nvcc -ccbin g++ -I../../common/inc -m64
-gencode arch=compute_20,code=sm_20
-gencode arch=compute_30,code=sm_30
-gencode arch=compute_35,code=sm_35
-gencode arch=compute_37,code=sm_37
-gencode arch=compute_50,code=sm_50
-gencode arch=compute_52,code=sm_52
-gencode arch=compute_60,code=sm_60
-gencode arch=compute_60,code=compute_60
-o MonteCarloMultiGPU.o -c MonteCarloMultiGPU.cpp
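Each -gencode pair embeds native machine code for one real GPU architecture, and the final arch=compute_60,code=compute_60 pair embeds PTX so newer GPUs can JIT-compile the binary. To test nvcc on its own you do not need all of them; a minimal kernel (hello.cu is an illustrative name) built for the 920M's compute capability 3.5 is enough:

// hello.cu - trivial kernel to verify that nvcc and the driver work together
#include <cstdio>
#include <cuda_runtime.h>

__global__ void hello()
{
    printf("Hello from block %d, thread %d\n", blockIdx.x, threadIdx.x);
}

int main()
{
    hello<<<2, 4>>>();        // 2 blocks of 4 threads each
    cudaDeviceSynchronize();  // wait so the device-side printf is flushed
    return 0;
}

Compile and run with:
root@master# nvcc -gencode arch=compute_35,code=sm_35 hello.cu -o hello
root@master# ./hello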
The program (MonteCarloMultiGPU.cpp from the CUDA samples):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

// includes, project
#include <helper_functions.h>   // helper utilities (timers, command-line parsing)
#include <helper_cuda.h>        // CUDA error checking and initialization
#include <multithreading.h>

#include "MonteCarlo_common.h"

int *pArgc = NULL;
char **pArgv = NULL;

#ifdef WIN32
#define strcasecmp _strcmpi
#endif

////////////////////////////////////////////////////////////////////////////////
// Common functions
////////////////////////////////////////////////////////////////////////////////
float randFloat(float low, float high)
{
    float t = (float)rand() / (float)RAND_MAX;
    return (1.0f - t) * low + t * high;
}

// Utility function to tweak problem size for small GPUs
int adjustProblemSize(int GPU_N, int default_nOptions)
{
    int nOptions = default_nOptions;

    // select problem size
    for (int i = 0; i < GPU_N; i++)
    {
        cudaDeviceProp deviceProp;
        checkCudaErrors(cudaGetDeviceProperties(&deviceProp, i));
        int cudaCores = _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor)
                        * deviceProp.multiProcessorCount;

        if (cudaCores <= 32)
        {
            nOptions = (nOptions < cudaCores / 2 ? nOptions : cudaCores / 2);
        }
    }

    return nOptions;
}

int adjustGridSize(int GPUIndex, int defaultGridSize)
{
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, GPUIndex));
    int maxGridSize = deviceProp.multiProcessorCount * 40;
    return ((defaultGridSize > maxGridSize) ? maxGridSize : defaultGridSize);
}

///////////////////////////////////////////////////////////////////////////////
// CPU reference functions
///////////////////////////////////////////////////////////////////////////////
extern "C" void MonteCarloCPU(TOptionValue &callValue,TOptionData optionData,float *h_Random,int pathN
);//Black-Scholes formula for call options
extern "C" void BlackScholesCall(float &CallResult,TOptionData optionData
);////////////////////////////////////////////////////////////////////////////////
// GPU-driving host thread
////////////////////////////////////////////////////////////////////////////////
//Timer
StopWatchInterface **hTimer = NULL;

static CUT_THREADPROC solverThread(TOptionPlan *plan)
{
    //Init GPU
    checkCudaErrors(cudaSetDevice(plan->device));

    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, plan->device));

    //Start the timer
    sdkStartTimer(&hTimer[plan->device]);

    // Allocate intermediate memory for MC integrator and initialize
    // RNG states
    initMonteCarloGPU(plan);

    // Main computation
    MonteCarloGPU(plan);

    checkCudaErrors(cudaDeviceSynchronize());

    //Stop the timer
    sdkStopTimer(&hTimer[plan->device]);

    //Shut down this GPU
    closeMonteCarloGPU(plan);

    cudaStreamSynchronize(0);

    printf("solverThread() finished - GPU Device %d: %s\n", plan->device, deviceProp.name);

    CUT_THREADEND;
}

static void multiSolver(TOptionPlan *plan, int nPlans)
{
    // allocate and initialize an array of stream handles
    cudaStream_t *streams = (cudaStream_t *) malloc(nPlans * sizeof(cudaStream_t));
    cudaEvent_t *events = (cudaEvent_t *)malloc(nPlans * sizeof(cudaEvent_t));

    for (int i = 0; i < nPlans; i++)
    {
        checkCudaErrors(cudaSetDevice(plan[i].device));
        checkCudaErrors(cudaStreamCreate(&(streams[i])));
        checkCudaErrors(cudaEventCreate(&(events[i])));
    }

    // ... (remainder of multiSolver omitted; see MonteCarloMultiGPU.cpp in the
    // CUDA samples for the complete function)
}

///////////////////////////////////////////////////////////////////////////////
// Main program
///////////////////////////////////////////////////////////////////////////////
#define DO_CPU
#undef DO_CPU

#define PRINT_RESULTS
#undef PRINT_RESULTS

void usage()
{
    printf("--method=[threaded,streamed] --scaling=[strong,weak] [--help]\n");
    printf("Method=threaded: 1 CPU thread for each GPU [default]\n");
    printf("       streamed: 1 CPU thread handles all GPUs (requires CUDA 4.0 or newer)\n");
    printf("Scaling=strong : constant problem size\n");
    printf("        weak   : problem size scales with number of available GPUs [default]\n");
}

int main(int argc, char **argv)
{
    char *multiMethodChoice = NULL;
    char *scalingChoice = NULL;
    bool use_threads = true;
    bool bqatest = false;
    bool strongScaling = false;

    pArgc = &argc;
    pArgv = argv;

    printf("%s Starting...\n\n", argv[0]);

    if (checkCmdLineFlag(argc, (const char **)argv, "qatest"))
    {
        bqatest = true;
    }

    getCmdLineArgumentString(argc, (const char **)argv, "method", &multiMethodChoice);
    getCmdLineArgumentString(argc, (const char **)argv, "scaling", &scalingChoice);

    if (checkCmdLineFlag(argc, (const char **)argv, "h") ||
        checkCmdLineFlag(argc, (const char **)argv, "help"))
    {
        usage();
        exit(EXIT_SUCCESS);
    }

    if (multiMethodChoice == NULL)
    {
        use_threads = false;
    }
    else
    {
        if (!strcasecmp(multiMethodChoice, "threaded"))
        {
            use_threads = true;
        }
        else
        {
            use_threads = false;
        }
    }

    if (use_threads == false)
    {
        printf("Using single CPU thread for multiple GPUs\n");
    }

    if (scalingChoice == NULL)
    {
        strongScaling = false;
    }
    else
    {
        if (!strcasecmp(scalingChoice, "strong"))
        {
            strongScaling = true;
        }
        else
        {
            strongScaling = false;
        }
    }

    //GPU number present in the system
    int GPU_N;
    checkCudaErrors(cudaGetDeviceCount(&GPU_N));
    int nOptions = 8 * 1024;

    nOptions = adjustProblemSize(GPU_N, nOptions);

    // select problem size
    int scale = (strongScaling) ? 1 : GPU_N;
    int OPT_N = nOptions * scale;
    int PATH_N = 262144;

    // initialize the timers
    hTimer = new StopWatchInterface*[GPU_N];

    for (int i = 0; i < GPU_N; i++)
    {
        sdkCreateTimer(&hTimer[i]);
        sdkResetTimer(&hTimer[i]);
    }

    // ... (input-data generation, the optionSolver setup, the threaded solver
    // path, and the declarations of i, sumDelta, sumRef and sumReserve used
    // below are omitted; see MonteCarloMultiGPU.cpp in the CUDA samples)

    if (!use_threads || bqatest)
    {
        multiSolver(optionSolver, GPU_N);

        printf("main(): GPU statistics, streamed\n");

        for (i = 0; i < GPU_N; i++)
        {
            // ... (per-GPU statistics printing omitted)
#ifdef PRINT_RESULTS
            // ... (per-option result printing omitted)
#endif
        }

        sumReserve /= OPT_N;
    }

#ifdef DO_CPU
    printf("main(): running CPU MonteCarlo...\n");
    TOptionValue callValueCPU;
    sumDelta = 0;
    sumRef = 0;

    for (i = 0; i < OPT_N; i++)
    {
        // ... (CPU reference computation omitted)
    }
#endif

    printf("Shutting down...\n");

    for (int i = 0; i < GPU_N; i++)
    {
        // ... (per-GPU cleanup omitted)
    }
}
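To make the structure of the sample clearer: the "streamed" path boils down to one host thread that binds to each device in turn with cudaSetDevice(), issues asynchronous work on a per-device stream, and only synchronizes at the end. Here is a self-contained sketch of that pattern; the kernel and all names (scaleKernel, d_data) are invented for illustration and are not part of the sample:

// multi_gpu_streams.cu - minimal sketch of the single-thread, one-stream-per-GPU
// pattern used by the "streamed" path of MonteCarloMultiGPU
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *data, int n, float factor)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= factor;
}

int main()
{
    int gpuN = 0;
    cudaGetDeviceCount(&gpuN);
    printf("Found %d GPU(s)\n", gpuN);

    const int N = 1 << 20;
    float **d_data = new float*[gpuN];
    cudaStream_t *streams = new cudaStream_t[gpuN];

    // One allocation and one stream per device; the launches are asynchronous,
    // so the single host thread moves on immediately and the GPUs run concurrently.
    for (int dev = 0; dev < gpuN; dev++)
    {
        cudaSetDevice(dev);
        cudaMalloc(&d_data[dev], N * sizeof(float));
        cudaMemset(d_data[dev], 0, N * sizeof(float));
        cudaStreamCreate(&streams[dev]);
        scaleKernel<<<(N + 255) / 256, 256, 0, streams[dev]>>>(d_data[dev], N, 2.0f);
    }

    // Wait for every device to finish, then clean up.
    for (int dev = 0; dev < gpuN; dev++)
    {
        cudaSetDevice(dev);
        cudaStreamSynchronize(streams[dev]);
        cudaStreamDestroy(streams[dev]);
        cudaFree(d_data[dev]);
    }

    delete[] d_data;
    delete[] streams;
    printf("Done\n");
    return 0;
}

The "threaded" path achieves the same overlap by starting one CPU thread per GPU (see solverThread above); the streamed variant trades those threads for explicit streams and events.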