我写了个用CG法解方程式组的c程序
运算量不大 矩阵是2000@2000的 用openmp 之后 用ompgetwtime测时间
在pc上ubuntu里面明显2线程比不用openmp快将近1.6倍
但是为什么同样的程序在Mac上用了openmp之后2线程花的时间比不用openmp还多呢 4线程也只能和openmp的时候大概持平
想知道为什么 完全相同的代码
编译器都是GCC
分割线 代码如下(在日本读书 所以有些日语 但不影响理解)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <omp.h>
#define N 2000
#define MAX 100000
#define EPS (1.0e-10)
#define TIMER 10
int cgm(double **amat, double *bvec, double *xvec, double *resh, int ndata, double
eps, int *n_his, int i_max){
int i, j, n;
double *pvec, *apvec, *axvec, *rvec;
double alpha, beta, r_norm_old, r_norm_new, pap_norm, b_norm;
double start_time, stop_time;
pvec = (double *)malloc(sizeof(double)*ndata);
apvec = (double *)malloc(sizeof(double)*ndata);
axvec = (double *)malloc(sizeof(double)*ndata);
rvec = (double *)malloc(sizeof(double)*ndata);
start_time = omp_get_wtime();
//
//Setting initial values
//
#pragma omp parallel for
for (i=0; i<ndata; i++){
xvec[i] = 0.0;
}
r_norm_old = 0.0;
b_norm = 0.0;
#pragma omp parallel for private(j) reduction(+:r_norm_old,b_norm)
for (i=0; i<ndata; i++){
axvec[i] = 0.0;
pvec[i] = 0.0;
for (j=0; j<ndata; j++){
axvec[i] += amat[i][j]*xvec[j];
}
rvec[i] = bvec[i] - axvec[i];
b_norm += bvec[i] * bvec[i];
r_norm_old += rvec[i]*rvec[i];
}
beta = 0.0;
resh[0] = 1.0;
//
// Main loop
//
*n_his = 1;
for (n=1; n<i_max; n++){
#pragma omp parallel for
for (i=0; i<ndata; i++){
pvec[i] = rvec[i] + beta*pvec[i];
}
pap_norm = 0.0;
#pragma omp parallel for private(i,j) reduction(+:pap_norm)
for (i=0; i<ndata; i++){
apvec[i] = 0.0;
for (j=0; j<ndata; j++){
apvec[i] += amat[i][j]*pvec[j];
}
pap_norm += pvec[i]*apvec[i];
}
alpha = r_norm_old/pap_norm;
r_norm_new = 0.0;
#pragma omp parallel for reduction(+:r_norm_new)
for (i=0; i<ndata; i++){
xvec[i] += alpha*pvec[i];
rvec[i] -= alpha*apvec[i];
r_norm_new += rvec[i]*rvec[i];
}
beta = r_norm_new/r_norm_old;
if (sqrt(r_norm_new) < eps*sqrt(b_norm)){
// for (i=0; i<ndata; i++){
// printf("xvec[%d] = %12.6e \n", i, xvec[i]);
// }
resh[*n_his] = sqrt(r_norm_new)/sqrt(b_norm);
for(i=0;i<N;i++){
free(amat[i]);
}
free(amat);
stop_time = omp_get_wtime();
printf("elapsed time=%lf s\n", stop_time - start_time);
return 0;
}else{
resh[*n_his] = sqrt(r_norm_new)/sqrt(b_norm);
(*n_his)++;
r_norm_old = r_norm_new;
}
}
for(i=0;i<N;i++){
free(amat[i]);
}
free(amat);
return -1;
}
void setup(double** a,double *x_setup,double *x,double *b){
int i,j;
double vxm;
for(i=0;i<N;i++){
for(j=0;j<N;j++){
if(j>i)
a[i][j]=2*(i+1)-1;
if(j<i||j==i)
a[i][j]=2*(j+1)-1;
}
}
for(i=0;i<N;i++){
x_setup[i]=1.0;
x[i]=0.0;
}
for(i=0;i<N;i++){
vxm=0;
for(j=0;j<N;j++){
vxm+=a[i][j]*x_setup[j];
}
b[i]=vxm;
}
}
int main(int argc,char *argv[]){
int i;
double **a;
int back;
char *s;
int threads=1;
if(argc>2||argc<=1){
printf("引数の数が違う、Thread数だけ引数として入力してください!\n");
exit(1);
}else{
s=argv[1];
threads=atoi(s);
printf("設定したThreads数は%dである.\n",threads);
}
/* set core number */
omp_set_num_threads(threads);
a=(double **)malloc(sizeof(double *)*N);
for(i=0;i<N;i++){
a[i]=(double *)malloc(sizeof(double)*N);
}
double x[N];
double x_setup[N];
double b[N];
double* resh = (double*)malloc(sizeof(double) * MAX);
int n_his;
setup(a,x_setup,x,b);
back=cgm(a,b,x,resh,N,EPS,&n_his,MAX);
return 0;
}