Removing OpenMP

This commit finishes the removal of OpenMP from the KiCad codebase. Removed in this commit are the OpenMP calls in 3d-viewer and qa/polygon_triangulation, as well as all references in CMakeLists.txt. std::thread is used instead for multithreaded computation.
2018-09-20 21:23:15 -07:00 · 2018-09-20 21:23:15 -07:00 · f8784f30a8
parent 21485e6f24
commit f8784f30a8
16 changed files with 905 additions and 881 deletions
--- a/3d-viewer/3d_canvas/create_3Dgraphic_brd_items.cpp
+++ b/3d-viewer/3d_canvas/create_3Dgraphic_brd_items.cpp
@ -40,7 +40,7 @@
 #include "../3d_rendering/3d_render_raytracing/accelerators/ccontainer2d.h"
 #include "../3d_rendering/3d_render_raytracing/shapes3D/ccylinder.h"
 #include "../3d_rendering/3d_render_raytracing/shapes3D/clayeritem.h"
-#include <openmp_mutex.h>
+
 #include <class_board.h>
 #include <class_module.h>
 #include <class_pad.h>
@ -845,11 +845,7 @@ void CINFO3D_VISU::AddSolidAreasShapesToContainer( const ZONE_CONTAINER* aZoneCo
                                                   PCB_LAYER_ID aLayerId )
 {
    // Copy the polys list because we have to simplify it
-    SHAPE_POLY_SET polyList = SHAPE_POLY_SET(aZoneContainer->GetFilledPolysList());
-    polyList.Simplify( SHAPE_POLY_SET::PM_FAST );
-
-    if( polyList.IsEmpty() )
-        return;
+    SHAPE_POLY_SET polyList = SHAPE_POLY_SET( aZoneContainer->GetFilledPolysList(), true );

    // This convert the poly in outline and holes
    Convert_shape_line_polygon_to_triangles( polyList,
--- a/3d-viewer/3d_canvas/create_layer_items.cpp
+++ b/3d-viewer/3d_canvas/create_layer_items.cpp
@ -40,7 +40,7 @@
 #include "../3d_rendering/3d_render_raytracing/accelerators/ccontainer2d.h"
 #include "../3d_rendering/3d_render_raytracing/shapes3D/ccylinder.h"
 #include "../3d_rendering/3d_render_raytracing/shapes3D/clayeritem.h"
-#include <openmp_mutex.h>
+
 #include <class_board.h>
 #include <class_module.h>
 #include <class_pad.h>
@ -52,6 +52,9 @@
 #include <trigo.h>
 #include <utility>
 #include <vector>
+#include <thread>
+#include <algorithm>
+#include <atomic>

 #include <profile.h>

@ -788,36 +791,43 @@ void CINFO3D_VISU::createLayers( REPORTER *aStatusTextReporter )

        // Add zones objects
        // /////////////////////////////////////////////////////////////////////
-        for( unsigned int lIdx = 0; lIdx < layer_id.size(); ++lIdx )
+        std::atomic<size_t> nextZone( 0 );
+        std::atomic<size_t> threadsFinished( 0 );
+
+        size_t parallelThreadCount = std::max<size_t>( std::thread::hardware_concurrency(), 2 );
+        for( size_t ii = 0; ii < parallelThreadCount; ++ii )
        {
-            const PCB_LAYER_ID curr_layer_id = layer_id[lIdx];
-
-            if( aStatusTextReporter )
-                aStatusTextReporter->Report( wxString::Format( _( "Create zones of layer %s" ),
-                                                               LSET::Name( curr_layer_id ) ) );
-
-            wxASSERT( m_layers_container2D.find( curr_layer_id ) != m_layers_container2D.end() );
-
-            CBVHCONTAINER2D *layerContainer = m_layers_container2D[curr_layer_id];
-
-            // ADD COPPER ZONES
-            for( int ii = 0; ii < m_board->GetAreaCount(); ++ii )
+            std::thread t = std::thread( [&]()
            {
-                const ZONE_CONTAINER* zone = m_board->GetArea( ii );
-                const PCB_LAYER_ID zonelayer = zone->GetLayer();
-
-                if( zonelayer == curr_layer_id )
+                for( size_t areaId = nextZone.fetch_add( 1 );
+                            areaId < static_cast<size_t>( m_board->GetAreaCount() );
+                            areaId = nextZone.fetch_add( 1 ) )
                {
-                    AddSolidAreasShapesToContainer( zone,
-                                                    layerContainer,
-                                                    curr_layer_id );
-                }
+                    const ZONE_CONTAINER* zone = m_board->GetArea( areaId );
+
+                    if( zone == nullptr )
+                        break;
+
+                    auto layerContainer = m_layers_container2D.find( zone->GetLayer() );
+
+                    if( layerContainer != m_layers_container2D.end() )
+                        AddSolidAreasShapesToContainer( zone, layerContainer->second,
+                                                        zone->GetLayer() );
                }
+
+                threadsFinished++;
+            } );
+
+            t.detach();
        }
+
+        while( threadsFinished < parallelThreadCount )
+            std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
+
    }

 #ifdef PRINT_STATISTICS_3D_VIEWER
-    printf( "T13: %.3f ms\n", (float)( GetRunningMicroSecs()  - start_Time  ) / 1e3 );
+    printf( "fill zones T13: %.3f ms\n", (float)( GetRunningMicroSecs()  - start_Time  ) / 1e3 );
    start_Time = GetRunningMicroSecs();
 #endif

@ -825,29 +835,18 @@ void CINFO3D_VISU::createLayers( REPORTER *aStatusTextReporter )
        GetFlag( FL_RENDER_OPENGL_COPPER_THICKNESS ) &&
        (m_render_engine == RENDER_ENGINE_OPENGL_LEGACY) )
    {
-        // Add zones poly contourns
-        // /////////////////////////////////////////////////////////////////////
-        for( unsigned int lIdx = 0; lIdx < layer_id.size(); ++lIdx )
-        {
-            const PCB_LAYER_ID curr_layer_id = layer_id[lIdx];
-
-            wxASSERT( m_layers_poly.find( curr_layer_id ) != m_layers_poly.end() );
-
-            SHAPE_POLY_SET *layerPoly = m_layers_poly[curr_layer_id];
-
        // ADD COPPER ZONES
        for( int ii = 0; ii < m_board->GetAreaCount(); ++ii )
        {
            const ZONE_CONTAINER* zone = m_board->GetArea( ii );
-                const LAYER_NUM zonelayer = zone->GetLayer();

-                if( zonelayer == curr_layer_id )
-                {
-                    zone->TransformSolidAreasShapesToPolygonSet( *layerPoly,
-                                                                 segcountforcircle,
-                                                                 correctionFactor );
-                }
-            }
+            if( zone == nullptr )
+                break;
+
+            auto layerContainer = m_layers_poly.find( zone->GetLayer() );
+
+            if( layerContainer != m_layers_poly.end() )
+                zone->TransformSolidAreasShapesToPolygonSet( *layerContainer->second, segcountforcircle, correctionFactor );
        }
    }

@ -865,22 +864,35 @@ void CINFO3D_VISU::createLayers( REPORTER *aStatusTextReporter )
    if( GetFlag( FL_RENDER_OPENGL_COPPER_THICKNESS ) &&
        (m_render_engine == RENDER_ENGINE_OPENGL_LEGACY) )
    {
-        const int nLayers = layer_id.size();
+        std::atomic<size_t> nextItem( 0 );
+        std::atomic<size_t> threadsFinished( 0 );

-        #pragma omp parallel for
-        for( signed int lIdx = 0; lIdx < nLayers; ++lIdx )
+        size_t parallelThreadCount = std::min<size_t>(
+                std::max<size_t>( std::thread::hardware_concurrency(), 2 ),
+                layer_id.size() );
+        for( size_t ii = 0; ii < parallelThreadCount; ++ii )
        {
-            const PCB_LAYER_ID curr_layer_id = layer_id[lIdx];
+            std::thread t = std::thread( [&nextItem, &threadsFinished, &layer_id, this]()
+            {
+                for( size_t i = nextItem.fetch_add( 1 );
+                            i < layer_id.size();
+                            i = nextItem.fetch_add( 1 ) )
+                {
+                    auto layerPoly = m_layers_poly.find( layer_id[i] );

-            wxASSERT( m_layers_poly.find( curr_layer_id ) != m_layers_poly.end() );
-
-            SHAPE_POLY_SET *layerPoly = m_layers_poly[curr_layer_id];
-
-            wxASSERT( layerPoly != NULL );
-
-            // This will make a union of all added contourns
-            layerPoly->Simplify( SHAPE_POLY_SET::PM_FAST );
+                    if( layerPoly != m_layers_poly.end() )
+                        // This will make a union of all added contours
+                        layerPoly->second->Simplify( SHAPE_POLY_SET::PM_FAST );
                }
+
+                threadsFinished++;
+            } );
+
+            t.detach();
+        }
+
+        while( threadsFinished < parallelThreadCount )
+            std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
    }

 #ifdef PRINT_STATISTICS_3D_VIEWER
--- a/3d-viewer/3d_rendering/3d_render_ogl_legacy/clayer_triangles.cpp
+++ b/3d-viewer/3d_rendering/3d_render_ogl_legacy/clayer_triangles.cpp
@ -30,6 +30,9 @@

 #include "clayer_triangles.h"
 #include <wx/debug.h>   // For the wxASSERT
+#include <mutex>
+#include <thread>
+#include <atomic>


 CLAYER_TRIANGLE_CONTAINER::CLAYER_TRIANGLE_CONTAINER( unsigned int aNrReservedTriangles,
@ -219,8 +222,8 @@ void CLAYER_TRIANGLES::AddToMiddleContourns( const std::vector< SFVEC2F > &aCont
            const SFVEC2F &v0 = aContournPoints[i + 0];
            const SFVEC2F &v1 = aContournPoints[i + 1];

-            #pragma omp critical
            {
+                std::lock_guard<std::mutex> lock( m_middle_layer_lock );
                m_layer_middle_contourns_quads->AddQuad( SFVEC3F( v0.x, v0.y, zTop ),
                                                         SFVEC3F( v1.x, v1.y, zTop ),
                                                         SFVEC3F( v1.x, v1.y, zBot ),
@ -305,8 +308,19 @@ void CLAYER_TRIANGLES::AddToMiddleContourns( const SHAPE_POLY_SET &aPolySet,
    m_layer_middle_contourns_quads->Reserve_More( nrContournPointsToReserve * 2,
                                                  true );

-    #pragma omp parallel for
-    for( signed int i = 0; i < aPolySet.OutlineCount(); ++i )
+    std::atomic<int> nextItem( 0 );
+    std::atomic<size_t> threadsFinished( 0 );
+
+    size_t parallelThreadCount = std::min<size_t>(
+            std::max<size_t>( std::thread::hardware_concurrency(), 2 ),
+            static_cast<size_t>( aPolySet.OutlineCount() ) );
+    for( size_t ii = 0; ii < parallelThreadCount; ++ii )
+    {
+        std::thread t = std::thread( [&]()
+        {
+            for( int i = nextItem.fetch_add( 1 );
+                     i < aPolySet.OutlineCount();
+                     i = nextItem.fetch_add( 1 ) )
            {
                // Add outline
                const SHAPE_LINE_CHAIN& pathOutline = aPolySet.COutline( i );
@ -320,6 +334,15 @@ void CLAYER_TRIANGLES::AddToMiddleContourns( const SHAPE_POLY_SET &aPolySet,
                    AddToMiddleContourns( hole, zBot, zTop, aBiuTo3Du, aInvertFaceDirection );
                }
            }
+
+            threadsFinished++;
+        } );
+
+        t.detach();
+    }
+
+    while( threadsFinished < parallelThreadCount )
+        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
 }


--- a/3d-viewer/3d_rendering/3d_render_ogl_legacy/clayer_triangles.h
+++ b/3d-viewer/3d_rendering/3d_render_ogl_legacy/clayer_triangles.h
@ -35,6 +35,7 @@
 #include <geometry/shape_line_chain.h>
 #include <geometry/shape_poly_set.h>
 #include <vector>
+#include <mutex>


 typedef std::vector< SFVEC3F > SFVEC3F_VECTOR;
@ -174,6 +175,8 @@ public:
                               float zTop,
                               bool aInvertFaceDirection );

+    std::mutex m_middle_layer_lock;
+
    CLAYER_TRIANGLE_CONTAINER *m_layer_top_segment_ends;
    CLAYER_TRIANGLE_CONTAINER *m_layer_top_triangles;
    CLAYER_TRIANGLE_CONTAINER *m_layer_middle_contourns_quads;
--- a/3d-viewer/3d_rendering/3d_render_raytracing/accelerators/ccontainer2d.cpp
+++ b/3d-viewer/3d_rendering/3d_render_raytracing/accelerators/ccontainer2d.cpp
@ -29,6 +29,7 @@

 #include "ccontainer2d.h"
 #include <vector>
+#include <mutex>
 #include <boost/range/algorithm/partition.hpp>
 #include <boost/range/algorithm/nth_element.hpp>
 #include <wx/debug.h>
@ -46,6 +47,7 @@ CGENERICCONTAINER2D::CGENERICCONTAINER2D( OBJECT2D_TYPE aObjType )

 void CGENERICCONTAINER2D::Clear()
 {
+    std::lock_guard<std::mutex> lock( m_lock );
    m_bbox.Reset();

    for( LIST_OBJECT2D::iterator ii = m_objects.begin();
--- a/3d-viewer/3d_rendering/3d_render_raytracing/accelerators/ccontainer2d.h
+++ b/3d-viewer/3d_rendering/3d_render_raytracing/accelerators/ccontainer2d.h
@ -32,6 +32,7 @@

 #include "../shapes2D/cobject2d.h"
 #include <list>
+#include <mutex>

 typedef std::list<COBJECT2D *> LIST_OBJECT2D;
 typedef std::list<const COBJECT2D *> CONST_LIST_OBJECT2D;
@ -52,6 +53,7 @@ public:
    {
        if( aObject ) // Only add if it is a valid pointer
        {
+            std::lock_guard<std::mutex> lock( m_lock );
            m_objects.push_back( aObject );
            m_bbox.Union( aObject->GetBBox() );
        }
@ -70,6 +72,7 @@ public:
                                           CONST_LIST_OBJECT2D &aOutList ) const = 0;

 private:
+    std::mutex m_lock;
 };


--- a/3d-viewer/3d_rendering/3d_render_raytracing/c3d_render_raytracing.cpp
+++ b/3d-viewer/3d_rendering/3d_render_raytracing/c3d_render_raytracing.cpp
@ -29,6 +29,9 @@

 #include <GL/glew.h>
 #include <climits>
+#include <atomic>
+#include <thread>
+#include <chrono>

 #include "c3d_render_raytracing.h"
 #include "mortoncodes.h"
@ -42,10 +45,6 @@
 // convertLinearToSRGB
 //#include <glm/gtc/color_space.hpp>

-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
 C3D_RENDER_RAYTRACING::C3D_RENDER_RAYTRACING( CINFO3D_VISU &aSettings ) :
                       C3D_RENDER_BASE( aSettings ),
                       m_postshader_ssao( aSettings.CameraGet() )
@ -137,7 +136,7 @@ void C3D_RENDER_RAYTRACING::restart_render_state()
    // Mark the blocks not processed yet
    std::fill( m_blockPositionsWasProcessed.begin(),
               m_blockPositionsWasProcessed.end(),
-               false );
+               0 );
 }


@ -364,61 +363,58 @@ void C3D_RENDER_RAYTRACING::rt_render_tracing( GLubyte *ptrPBO ,
                                               REPORTER *aStatusTextReporter )
 {
    m_isPreview = false;
-    wxASSERT( m_blockPositions.size() <= LONG_MAX );

-    const long nrBlocks = (long) m_blockPositions.size();
-    const unsigned startTime = GetRunningMicroSecs();
+    auto startTime = std::chrono::steady_clock::now();
    bool breakLoop = false;
-    int numBlocksRendered = 0;

-    #pragma omp parallel for schedule(dynamic) shared(breakLoop) \
-        firstprivate(ptrPBO) reduction(+:numBlocksRendered) default(none)
-    for( long iBlock = 0; iBlock < nrBlocks; iBlock++ )
+    std::atomic<size_t> numBlocksRendered( 0 );
+    std::atomic<size_t> currentBlock( 0 );
+    std::atomic<size_t> threadsFinished( 0 );
+
+    size_t parallelThreadCount = std::min<size_t>(
+            std::max<size_t>( std::thread::hardware_concurrency(), 2 ),
+            m_blockPositions.size() );
+    for( size_t ii = 0; ii < parallelThreadCount; ++ii )
    {
-
-        #pragma omp flush(breakLoop)
-        if( !breakLoop )
+        std::thread t = std::thread( [&]()
        {
-            bool process_block;
-
-            // std::vector<bool> stuffs eight bools to each byte, so access to
-            // them can never be natively atomic.
-            #pragma omp critical(checkProcessBlock)
+            for( size_t iBlock = currentBlock.fetch_add( 1 );
+                        iBlock < m_blockPositions.size() && !breakLoop;
+                        iBlock = currentBlock.fetch_add( 1 ) )
            {
-                process_block = !m_blockPositionsWasProcessed[iBlock];
-                m_blockPositionsWasProcessed[iBlock] = true;
-            }
-
-            if( process_block )
+                if( !m_blockPositionsWasProcessed[iBlock] )
                {
                    rt_render_trace_block( ptrPBO, iBlock );
                    numBlocksRendered++;
-
+                    m_blockPositionsWasProcessed[iBlock] = 1;

                    // Check if it spend already some time render and request to exit
                    // to display the progress
-                #ifdef _OPENMP
-                if( omp_get_thread_num() == 0 )
-                #endif
-                    if( (GetRunningMicroSecs() - startTime) > 150000 )
-                    {
+                    if( std::chrono::duration_cast<std::chrono::milliseconds>(
+                            std::chrono::steady_clock::now() - startTime ).count() > 150 )
                        breakLoop = true;
-                        #pragma omp flush(breakLoop)
-                    }
                }
            }
+
+            threadsFinished++;
+        } );
+
+        t.detach();
    }

+    while( threadsFinished < parallelThreadCount )
+        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
+
    m_nrBlocksRenderProgress += numBlocksRendered;

    if( aStatusTextReporter )
        aStatusTextReporter->Report( wxString::Format( _( "Rendering: %.0f %%" ),
                                                       (float)(m_nrBlocksRenderProgress * 100) /
-                                                       (float)nrBlocks ) );
+                                                       (float)m_blockPositions.size() ) );

    // Check if it finish the rendering and if should continue to a post processing
    // or mark it as finished
-    if( m_nrBlocksRenderProgress >= nrBlocks )
+    if( m_nrBlocksRenderProgress >= m_blockPositions.size() )
    {
        if( m_settings.GetFlag( FL_RENDER_RAYTRACING_POST_PROCESSING ) )
            m_rt_render_state = RT_RENDER_STATE_POST_PROCESS_SHADE;
@ -925,9 +921,17 @@ void C3D_RENDER_RAYTRACING::rt_render_post_process_shade( GLubyte *ptrPBO,
        if( aStatusTextReporter )
            aStatusTextReporter->Report( _("Rendering: Post processing shader") );

-        // Compute the shader value
-        #pragma omp parallel for schedule(dynamic)
-        for( signed int y = 0; y < (int)m_realBufferSize.y; ++y )
+        std::atomic<size_t> nextBlock( 0 );
+        std::atomic<size_t> threadsFinished( 0 );
+
+        size_t parallelThreadCount = std::max<size_t>( std::thread::hardware_concurrency(), 2 );
+        for( size_t ii = 0; ii < parallelThreadCount; ++ii )
+        {
+            std::thread t = std::thread( [&]()
+            {
+                for( size_t y = nextBlock.fetch_add( 1 );
+                            y < m_realBufferSize.y;
+                            y = nextBlock.fetch_add( 1 ) )
                {
                    SFVEC3F *ptr = &m_shaderBuffer[ y * m_realBufferSize.x ];

@ -938,8 +942,14 @@ void C3D_RENDER_RAYTRACING::rt_render_post_process_shade( GLubyte *ptrPBO,
                    }
                }

-        // Wait for all threads to finish
-        #pragma omp barrier
+                threadsFinished++;
+            } );
+
+            t.detach();
+        }
+
+        while( threadsFinished < parallelThreadCount )
+            std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );

        // Set next state
        m_rt_render_state = RT_RENDER_STATE_POST_PROCESS_BLUR_AND_FINISH;
@ -960,8 +970,17 @@ void C3D_RENDER_RAYTRACING::rt_render_post_process_blur_finish( GLubyte *ptrPBO,
    if( m_settings.GetFlag( FL_RENDER_RAYTRACING_POST_PROCESSING ) )
    {
        // Now blurs the shader result and compute the final color
-        #pragma omp parallel for schedule(dynamic)
-        for( signed int y = 0; y < (int)m_realBufferSize.y; ++y )
+        std::atomic<size_t> nextBlock( 0 );
+        std::atomic<size_t> threadsFinished( 0 );
+
+        size_t parallelThreadCount = std::max<size_t>( std::thread::hardware_concurrency(), 2 );
+        for( size_t ii = 0; ii < parallelThreadCount; ++ii )
+        {
+            std::thread t = std::thread( [&]()
+            {
+                for( size_t y = nextBlock.fetch_add( 1 );
+                            y < m_realBufferSize.y;
+                            y = nextBlock.fetch_add( 1 ) )
                {
                    GLubyte *ptr = &ptrPBO[ y * m_realBufferSize.x * 4 ];

@ -980,8 +999,8 @@ void C3D_RENDER_RAYTRACING::rt_render_post_process_blur_finish( GLubyte *ptrPBO,

                    for( signed int x = 0; x < (int)m_realBufferSize.x; ++x )
                    {
-// This #if should be 1, it is here that can be used for debug proposes during development
-#if 1
+        // This #if should be 1, it is here that can be used for debug proposes during development
+        #if 1
                        int idx = x > 1 ? -2 : 0;
                        SFVEC3F bluredShadeColor = ptrShaderY0[idx] * 1.0f / 273.0f +
                                                   ptrShaderY1[idx] * 4.0f / 273.0f +
@ -1023,19 +1042,19 @@ void C3D_RENDER_RAYTRACING::rt_render_post_process_blur_finish( GLubyte *ptrPBO,
                        ++ptrShaderY3;
                        ++ptrShaderY4;

-#ifdef USE_SRGB_SPACE
+        #ifdef USE_SRGB_SPACE
                        const SFVEC3F originColor = convertLinearToSRGB( m_postshader_ssao.GetColorAtNotProtected( SFVEC2I( x,y ) ) );
-#else
+        #else
                        const SFVEC3F originColor = m_postshader_ssao.GetColorAtNotProtected( SFVEC2I( x,y ) );
-#endif
+        #endif

                        const SFVEC3F shadedColor = m_postshader_ssao.ApplyShadeColor( SFVEC2I( x,y ), originColor, bluredShadeColor );
-#else
+        #else
                        // Debug code
                        //const SFVEC3F shadedColor =  SFVEC3F( 1.0f ) -
                        //                             m_shaderBuffer[ y * m_realBufferSize.x + x];
                        const SFVEC3F shadedColor =  m_shaderBuffer[ y * m_realBufferSize.x + x ];
-#endif
+        #endif

                        rt_final_color( ptr, shadedColor, false );

@ -1043,8 +1062,15 @@ void C3D_RENDER_RAYTRACING::rt_render_post_process_blur_finish( GLubyte *ptrPBO,
                    }
                }

-        // Wait for all threads to finish
-        #pragma omp barrier
+                threadsFinished++;
+            } );
+
+            t.detach();
+        }
+
+        while( threadsFinished < parallelThreadCount )
+            std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
+

        // Debug code
        //m_postshader_ssao.DebugBuffersOutputAsImages();
@ -1059,10 +1085,19 @@ void C3D_RENDER_RAYTRACING::render_preview( GLubyte *ptrPBO )
 {
    m_isPreview = true;

-    unsigned int nrBlocks = m_blockPositionsFast.size();
+    std::atomic<size_t> nextBlock( 0 );
+    std::atomic<size_t> threadsFinished( 0 );

-    #pragma omp parallel for schedule(dynamic)
-    for( signed int iBlock = 0; iBlock < (int)nrBlocks; iBlock++ )
+    size_t parallelThreadCount = std::min<size_t>(
+            std::max<size_t>( std::thread::hardware_concurrency(), 2 ),
+            m_blockPositions.size() );
+    for( size_t ii = 0; ii < parallelThreadCount; ++ii )
+    {
+        std::thread t = std::thread( [&]()
+        {
+            for( size_t iBlock = nextBlock.fetch_add( 1 );
+                        iBlock < m_blockPositionsFast.size();
+                        iBlock = nextBlock.fetch_add( 1 ) )
            {
                const SFVEC2UI &windowPosUI = m_blockPositionsFast[ iBlock ];
                const SFVEC2I windowsPos = SFVEC2I( windowPosUI.x + m_xoffset,
@ -1665,8 +1700,14 @@ void C3D_RENDER_RAYTRACING::render_preview( GLubyte *ptrPBO )
                }
            }

-    // Wait for all threads to finish (not sure if this is need)
-    #pragma omp barrier
+            threadsFinished++;
+        } );
+
+        t.detach();
+    }
+
+    while( threadsFinished < parallelThreadCount )
+        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
 }


--- a/3d-viewer/3d_rendering/3d_render_raytracing/c3d_render_raytracing.h
+++ b/3d-viewer/3d_rendering/3d_render_raytracing/c3d_render_raytracing.h
@ -131,7 +131,7 @@ private:
    unsigned long int m_stats_start_rendering_time;

    /// Save the number of blocks progress of the render
-    long m_nrBlocksRenderProgress;
+    size_t m_nrBlocksRenderProgress;

    CPOSTSHADER_SSAO m_postshader_ssao;

@ -165,7 +165,7 @@ private:
    std::vector< SFVEC2UI > m_blockPositions;

    /// this flags if a position was already processed (cleared each new render)
-    std::vector< bool > m_blockPositionsWasProcessed;
+    std::vector< int > m_blockPositionsWasProcessed;

    /// this encodes the Morton code positions (on fast preview mode)
    std::vector< SFVEC2UI > m_blockPositionsFast;
--- a/3d-viewer/3d_rendering/cimage.cpp
+++ b/3d-viewer/3d_rendering/cimage.cpp
@ -31,6 +31,10 @@
 #include "buffers_debug.h"
 #include <string.h> // For memcpy

+#include <atomic>
+#include <thread>
+#include <chrono>
+
 #ifndef CLAMP
 #define CLAMP(n, min, max) {if( n < min ) n=min; else if( n > max ) n = max;}
 #endif
@ -469,16 +473,26 @@ void CIMAGE::EfxFilter( CIMAGE *aInImg, E_FILTER aFilterType )
    aInImg->m_wraping = WRAP_CLAMP;
    m_wraping = WRAP_CLAMP;

-    #pragma omp parallel for
-    for( int iy = 0; iy < (int)m_height; iy++)
+    std::atomic<size_t> nextRow( 0 );
+    std::atomic<size_t> threadsFinished( 0 );
+
+    size_t parallelThreadCount = std::max<size_t>( std::thread::hardware_concurrency(), 2 );
+
+    for( size_t ii = 0; ii < parallelThreadCount; ++ii )
    {
-        for( int ix = 0; ix < (int)m_width; ix++ )
+        std::thread t = std::thread( [&]()
+        {
+            for( size_t iy = nextRow.fetch_add( 1 );
+                        iy < m_height;
+                        iy = nextRow.fetch_add( 1 ) )
+            {
+                for( size_t ix = 0; ix < m_width; ix++ )
                {
                    int v = 0;

-            for( int sy = 0; sy < 5; sy++ )
+                    for( size_t sy = 0; sy < 5; sy++ )
                    {
-                for( int sx = 0; sx < 5; sx++ )
+                        for( size_t sx = 0; sx < 5; sx++ )
                        {
                            int factor = filter.kernel[sx][sy];
                            unsigned char pixelv = aInImg->Getpixel( ix + sx - 2,
@ -489,14 +503,21 @@ void CIMAGE::EfxFilter( CIMAGE *aInImg, E_FILTER aFilterType )
                    }

                    v /= filter.div;
-
                    v += filter.offset;
-
                    CLAMP(v, 0, 255);
-
+                    //TODO: This needs to write to a separate buffer
                    m_pixels[ix + iy * m_width] = v;
                }
            }
+
+            threadsFinished++;
+        } );
+
+        t.detach();
+    }
+
+    while( threadsFinished < parallelThreadCount )
+        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
 }


--- a/3d-viewer/openmp_mutex.h
+++ b/3d-viewer/openmp_mutex.h
@ -1,81 +0,0 @@
-/*
- * This program source code file is part of KiCad, a free EDA CAD application.
- *
- * Copyright (C) 2016 Mario Luzeiro <mrluzeiro@ua.pt>
- * Copyright (C) 1992-2016 KiCad Developers, see AUTHORS.txt for contributors.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you may find one here:
- * http://www.gnu.org/licenses/old-licenses/gpl-2.0.html
- * or you may search the http://www.gnu.org website for the version 2 license,
- * or you may write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
- */
-
-
-/**
- * @file  openmp_mutex.h
- * @brief a mutex for openmp got from the website:
- * http://bisqwit.iki.fi/story/howto/openmp/
- * by Joel Yliluoma <bisqwit@iki.fi>
- */
-
-#ifndef _OPENMP_MUTEX_H
-#define _OPENMP_MUTEX_H
-
-#ifdef _OPENMP
-
-# include <omp.h>
-
-struct MutexType
-{
-    MutexType() { omp_init_lock( &lock ); }
-    ~MutexType() { omp_destroy_lock( &lock ); }
-    void Lock() { omp_set_lock( &lock ); }
-    void Unlock() { omp_unset_lock( &lock ); }
-
-    MutexType( const MutexType& ) { omp_init_lock( &lock ); }
-    MutexType& operator= ( const MutexType& ) { return *this; }
-public:
-    omp_lock_t lock;
-};
-
-#else
-
-/// A dummy mutex that doesn't actually exclude anything,
-/// but as there is no parallelism either, no worries.
-struct MutexType
-{
-    void Lock() {}
-    void Unlock() {}
-};
-#endif
-
-/// An exception-safe scoped lock-keeper.
-struct ScopedLock
-{
-    explicit ScopedLock( MutexType& m ) : mut( m ), locked( true ) { mut.Lock(); }
-    ~ScopedLock() { Unlock(); }
-    void Unlock() { if( !locked ) return; locked = false; mut.Unlock(); }
-    void LockAgain() { if( locked ) return; mut.Lock(); locked = true; }
-
-private:
-    MutexType& mut;
-    bool locked;
-
-private: // prevent copying the scoped lock.
-    void operator=(const ScopedLock&);
-    ScopedLock(const ScopedLock&);
-};
-
-#endif // _OPENMP_MUTEX_H
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -543,24 +543,6 @@ include( ExternalProject )
 #================================================
 include( CheckFindPackageResult )

-#
-# Find OpenMP support, optional
-#
-
-find_package( OpenMP )
-
-if( OPENMP_FOUND )
-    set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
-    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
-    add_definitions( -DUSE_OPENMP )
-
-    # MinGW does not include the OpenMP link library and FindOpenMP.cmake does not
-    # set it either.  Not sure this is the most elegant solution but it works.
-    if( MINGW )
-        set( OPENMP_LIBRARIES gomp )
-    endif()
-endif()
-
 #
 # Find wxWidgets library, required
 #
--- a/common/geometry/shape_poly_set.cpp
+++ b/common/geometry/shape_poly_set.cpp
@ -52,9 +52,18 @@ SHAPE_POLY_SET::SHAPE_POLY_SET() :
 }


-SHAPE_POLY_SET::SHAPE_POLY_SET( const SHAPE_POLY_SET& aOther ) :
+SHAPE_POLY_SET::SHAPE_POLY_SET( const SHAPE_POLY_SET& aOther, bool aDeepCopy ) : // copy-constructs from aOther; NOTE(review): aDeepCopy is never read in this body
    SHAPE( SH_POLY_SET ), m_polys( aOther.m_polys ) // the polygon list is copied from aOther.m_polys
 {
+    if( aOther.IsTriangulationUpToDate() ) // triangulation data is carried over only when the source reports it as up to date
+    {
+        for( unsigned i = 0; i < aOther.TriangulatedPolyCount(); i++ )
+            m_triangulatedPolys.push_back( // each entry is a newly allocated TRIANGULATED_POLYGON copy-constructed from the source's entry
+                    std::make_unique<TRIANGULATED_POLYGON>( *aOther.TriangulatedPolygon( i ) ) );
+
+        m_hash = aOther.GetHash(); // take the source's hash together with its triangulation
+        m_triangulationValid = true; // the copied triangulation is treated as valid for this object
+    }
 }


--- a/cvpcb/CMakeLists.txt
+++ b/cvpcb/CMakeLists.txt
@ -151,7 +151,6 @@ target_link_libraries( cvpcb_kiface
    gal
    ${wxWidgets_LIBRARIES}
    ${GDI_PLUS_LIBRARIES}
-    ${OPENMP_LIBRARIES}         # used by 3d viewer
    )

 if( BUILD_GITHUB_PLUGIN )
--- a/include/geometry/shape_poly_set.h
+++ b/include/geometry/shape_poly_set.h
@ -424,8 +424,9 @@ class SHAPE_POLY_SET : public SHAPE
         * Copy constructor SHAPE_POLY_SET
         * Performs a deep copy of \p aOther into \p this.
         * @param aOther is the SHAPE_POLY_SET object that will be copied.
+         * @param aDeepCopy if true, make new copies of the triangulated unique_ptr vector
         */
-        SHAPE_POLY_SET( const SHAPE_POLY_SET& aOther );
+        SHAPE_POLY_SET( const SHAPE_POLY_SET& aOther, bool aDeepCopy = false );

        ~SHAPE_POLY_SET();

--- a/pcbnew/CMakeLists.txt
+++ b/pcbnew/CMakeLists.txt
@ -652,12 +652,6 @@ if ( KICAD_BUILD_TESTS )

 endif ()

-if( ${OPENMP_FOUND} )
-    set_target_properties( pcbnew_kiface PROPERTIES
-        COMPILE_FLAGS   ${OpenMP_CXX_FLAGS}
-        )
-endif()
-
 set( PCBNEW_KIFACE_LIBRARIES
    3d-viewer
    pcbcommon
@ -675,7 +669,6 @@ set( PCBNEW_KIFACE_LIBRARIES
    ${PYTHON_LIBRARIES}
    ${Boost_LIBRARIES}      # must follow GITHUB
    ${PCBNEW_EXTRA_LIBS}    # -lrt must follow Boost
-    ${OPENMP_LIBRARIES}
    )


--- a/qa/polygon_triangulation/test_polygon_triangulation.cpp
+++ b/qa/polygon_triangulation/test_polygon_triangulation.cpp
@ -32,6 +32,7 @@
 #include <class_zone.h>
 #include <profile.h>

+#include <atomic>
 #include <thread>
 #include <unordered_set>
 #include <utility>
@ -229,17 +230,26 @@ int main( int argc, char *argv[] )
    PROF_COUNTER cnt( "allBoard" );


-    #pragma omp parallel for schedule(dynamic)
-    for( int z = 0; z<brd->GetAreaCount(); z++ )
+    std::atomic<size_t> zonesToTriangulate( 0 );
+    std::atomic<size_t> threadsFinished( 0 );
+
+    size_t parallelThreadCount = std::max<size_t>( std::thread::hardware_concurrency(), 2 );
+    for( size_t ii = 0; ii < parallelThreadCount; ++ii )
    {
-        auto zone = brd->GetArea( z );
+        std::thread t = std::thread( [brd, &zonesToTriangulate, &threadsFinished] ()
+        {
+            for( size_t areaId = zonesToTriangulate.fetch_add( 1 );
+                        areaId < static_cast<size_t>( brd->GetAreaCount() );
+                        areaId = zonesToTriangulate.fetch_add( 1 ) )
+            {
+                auto zone = brd->GetArea( areaId );
                SHAPE_POLY_SET poly = zone->GetFilledPolysList();

                poly.CacheTriangulation();

                (void) poly;
-        printf("zone %d/%d\n", ( z+1 ), brd->GetAreaCount() );
-#if 0
+                printf("zone %zu/%d\n", ( areaId + 1 ), brd->GetAreaCount() );
+        #if 0
                PROF_COUNTER unfrac("unfrac");
                poly.Unfracture( SHAPE_POLY_SET::PM_FAST );
                unfrac.Show();
@ -251,9 +261,19 @@ int main( int argc, char *argv[] )
                    poly.triangulatePoly( &poly.Polygon(i) );
                }
                triangulate.Show();
-#endif
+        #endif
            }

+            threadsFinished++;
+        } );
+
+        t.detach();
+    }
+
+    while( threadsFinished < parallelThreadCount )
+        std::this_thread::sleep_for( std::chrono::milliseconds( 10 ) );
+
+
    cnt.Show();

    delete brd;