OpenMP C program runs slower than sequential code
I am a newbie to OpenMP, trying to parallelize Jarvis's algorithm. However, it turns out that the parallel program takes 2-3 times longer than the sequential code.
Is it that the problem itself cannot be parallelized, or is there something wrong with how I parallelized it?
This is my OpenMP program for the problem, with two parts parallelized:
#include <stdio.h>
#include <sys/time.h>
#include <omp.h>

typedef struct Point
{
    int x, y;
} Point;

// To find the orientation of the ordered triplet (p, i, q).
// The function returns
// 0 for collinear points
// 1 for clockwise
// 2 for counterclockwise
int orientation(Point p, Point i, Point q)
{
    int val = (i.y - p.y) * (q.x - i.x) -
              (i.x - p.x) * (q.y - i.y);
    if (val == 0) return 0;       // collinear
    return (val > 0) ? 1 : 2;     // clockwise or counterclockwise
}

// Prints the convex hull of a set of n points.
void convexHull(Point points[], int n)
{
    // There must be at least 3 points
    if (n < 3) return;

    // Initialize array to store results
    Point results[n];
    int count = 0;

    // Find the leftmost point
    int l = 0, i;
    #pragma omp parallel shared(n, l) private(i)
    {
        #pragma omp for
        for (i = 1; i < n; i++)
        {
            #pragma omp critical
            {
                if (points[i].x < points[l].x)
                    l = i;
            }
        }
    }

    // Start from the leftmost point and keep moving counterclockwise
    // until we reach the start point again.
    int p = l, q;
    do
    {
        // Add current point to result
        results[count] = points[p];
        count++;
        q = (p + 1) % n;
        int k;
        #pragma omp parallel shared(p) private(k)
        {
            #pragma omp for
            for (k = 0; k < n; k++)
            {
                // If k is more counterclockwise than the current q,
                // then update q to k
                #pragma omp critical
                {
                    if (orientation(points[p], points[k], points[q]) == 2)
                        q = k;
                }
            }
        }
        // Now q is the most counterclockwise point with respect to p.
        // Set p to q for the next iteration, so q gets added to the result.
        p = q;
    } while (p != l); // until the algorithm returns to the first point

    // Print result
    int j;
    for (j = 0; j < count; j++) {
        printf("(%d,%d)\n", results[j].x, results[j].y);
    }
}

int main()
{
    // Declarations for start time, end time
    // and total executions of the algorithm
    struct timeval start, end;
    int i, num_run = 100;
    gettimeofday(&start, NULL);
    Point points[] = {{0, 3}, {2, 2}, {1, 1}, {2, 1},
                      {3, 0}, {0, 0}, {3, 3}};
    int n = sizeof(points) / sizeof(points[0]);
    convexHull(points, n);
    gettimeofday(&end, NULL);
    int cpu_time_used = ((end.tv_sec - start.tv_sec) * 1000000) +
                        (end.tv_usec - start.tv_usec); // elapsed time in microseconds
    printf("\n\nExecution time: %d us\n", cpu_time_used);
    return 0;
}
I tried to make the input substantial enough by adding these lines of code:
Point points[3000];
int i;
for (i = 0; i < 3000; i++) {
    points[i].x = rand() % 100;
    points[i].y = rand() % 100;
    int j;
    for (j = i + 1; j < 3000; j++) {
        if (points[i].x == points[j].x) {
            if (points[i].y == points[j].y) {
                i--;
                break;
            }
        }
    }
}
But it crashes sometimes
c openmp convex-hull
asked Nov 21 '18 at 10:55 by Yeoh FS · edited Nov 26 '18 at 12:25 by Brice
For small data sets parallelization may be slower. The cause is the overhead in creating and managing the threads.
– Osiris
Nov 21 '18 at 10:59
I suggest trying out another algorithm that you know is highly "parallelizable", or enlarging the dataset as suggested by @Osiris.
– Welgriv
Nov 21 '18 at 11:08
1 Answer
In the following piece of your code, the whole content of the parallel for loop is wrapped in a critical statement. This means that this part of the code will never be entered by more than one thread at a time. Having multiple threads work one at a time will not go faster than a single thread going through all the iterations, and on top of that, some time is lost in synchronization overhead: each thread must acquire a mutex before entering the critical section and release it afterwards.
int l = 0, i;
#pragma omp parallel shared(n, l) private(i)
{
    #pragma omp for
    for (i = 1; i < n; i++)
    {
        #pragma omp critical
        {
            if (points[i].x < points[l].x)
                l = i;
        }
    }
}
The serial code needs to be somewhat refactored for parallelization. Reduction is often a good approach for simple operations: have each thread compute a partial result over its share of the iterations (e.g. a partial minimum or a partial sum), then merge all the partial results into a global one. For supported operations, the #pragma omp for reduction(op:var) syntax can be used, but in this case the reduction has to be done manually.
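As an aside (a minimal sketch added for illustration, not part of the original answer; it assumes OpenMP 3.1 or later for the min reduction operator): for a supported operation, the built-in clause can find the minimum x value, but it cannot return the index of that minimum, which is exactly why the index search needs the manual pattern shown next.

int min_x = points[0].x;
#pragma omp parallel for reduction(min: min_x)
for (int i = 1; i < n; i++)
{
    // Each thread minimizes its own private copy of min_x;
    // OpenMP merges the per-thread minima when the loop ends.
    if (points[i].x < min_x)
        min_x = points[i].x;
}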
See how the following code relies on thread-local variables to track the index of the minimum x, then uses a single critical section to compute the global minimum index.
int l = 0, i;
#pragma omp parallel shared(n, l) private(i)
{
    int l_local = 0; // This variable is private to the thread
    #pragma omp for nowait
    for (i = 1; i < n; i++)
    {
        // This part of the code can be executed in parallel
        // since all write operations are on thread-local variables
        if (points[i].x < points[l_local].x)
            l_local = i;
    }
    // The critical section is entered only once by each thread
    #pragma omp critical
    {
        if (points[l_local].x < points[l].x)
            l = l_local;
    }
    #pragma omp barrier
    // A barrier is needed in case more code follows;
    // otherwise there is an implicit barrier at the end of the parallel region
}
The same principle should be applied to the second parallel loop, which suffers from the same issue of being entirely serialized by the critical statement.
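For illustration, here is one way that refactoring might look (a hedged sketch, not from the original answer and untested; it assumes q has already been set to (p+1)%n before the region, as in the do-while body above). Each thread tracks its own candidate q_local, and the candidates are merged into the shared q with one critical section entry per thread:

int k;
#pragma omp parallel shared(n, p, q) private(k)
{
    int q_local = (p + 1) % n; // thread-local candidate, same initial guess as the serial code
    #pragma omp for nowait
    for (k = 0; k < n; k++)
    {
        // Only thread-local state is written here, so no critical section is needed
        if (orientation(points[p], points[k], points[q_local]) == 2)
            q_local = k;
    }
    // Merge step: each thread enters the critical section once
    // to check its candidate against the shared q
    #pragma omp critical
    {
        if (orientation(points[p], points[q_local], points[q]) == 2)
            q = q_local;
    }
}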
answered Nov 21 '18 at 12:49 by Brice · edited Nov 22 '18 at 9:47
Since variable n is not modified, I see no reason why it should be shared. Making it private may afford lighter-weight accesses by the various participating threads.
– John Bollinger
Nov 21 '18 at 14:58
Then it has to be tagged as firstprivate(n) so that the private copy within each thread is initialized with the value of the pre-existing variable. I do not think it will have much impact, though: with a static schedule, the master thread will access its value once, then distribute the work among the threads, which will then loop based on their own internal, private variables.
– Brice
Nov 21 '18 at 17:45
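For clarity, a sketch of how the first loop would read with that clause (an illustration added here, not from the comment thread):

int l = 0, i;
#pragma omp parallel firstprivate(n) shared(l) private(i)
{
    int l_local = 0;
    #pragma omp for nowait
    for (i = 1; i < n; i++)   // each thread loops on its own private copy of n
    {
        if (points[i].x < points[l_local].x)
            l_local = i;
    }
    #pragma omp critical
    {
        if (points[l_local].x < points[l].x)
            l = l_local;
    }
}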
Tried to make the input substantial enough by adding these lines of code: Point points[3000]; int i; for(i=0;i<3000;i++) { points[i].x = rand()%100; points[i].y = rand()%100; int j; for(j=i+1;j<3000;j++) { if(points[i].x==points[j].x) { if(points[i].y==points[j].y) { i--; break; } } } } But it crashes sometimes.
– Yeoh FS
Nov 25 '18 at 10:38