Updated January 29, 2014, 4:42 pm

Calculating Terrain Normals

While porting some of the old terrain code to d3d11 I came upon the function that was calculating terrain normals for the mesh.

This code is probably untouched for more than 5 years and it wasn't really working fast. It doesn't matter if the normals are calculated on application load but for something like an editor where the terrain can be modified it is not good enough.

The function is below :

void GeoMipmapTerrain::computeNormals()
{
    if(!m_Vertices.size())
        return;

    STE::Profiler::begin(__FUNCTION__);
    
    //allocate for face normals first
    Vector3 *faceNormals = new Vector3 [(m_Size-1)*(m_Size-1)*2];

    Vector3 vVector1,vVector2;
    Vector3 vPoly[3];
    Vector3 vNormal;
    int i = 0;
    int k = 0;
    int index=0;
    int cur = 0;
    int iCache = 0;

    //compute face normals
    for(i=0;i<m_Size-1; ++i)
    {
        iCache = i*m_Size;
        for(k=0;k<m_Size-1; ++k)
        {
            cur = k+iCache;
            //first poly
            vPoly[0] = m_Vertices[cur+m_Size].v;
            vPoly[1] = m_Vertices[cur].v;
            vPoly[2] = m_Vertices[cur+m_Size+1].v;

            vVector1 = vPoly[0] - vPoly[2];
            vVector2 = vPoly[2] - vPoly[1];	

            STMath::Cross(vNormal,vVector2, vVector1);		
            STMath::Normalize(vNormal,vNormal);				

            faceNormals[index] = vNormal;					
            index++;
            //second poly
            vPoly[0] = m_Vertices[cur+m_Size+1].v;
            vPoly[1] = m_Vertices[cur].v;
            vPoly[2] = m_Vertices[cur+1].v;

            vVector1 = vPoly[0] - vPoly[2];
            vVector2 = vPoly[2] - vPoly[1];

            STMath::Cross(vNormal,vVector2, vVector1);		
            STMath::Normalize(vNormal,vNormal);				

            faceNormals[index] = vNormal;	
            index++;
        }
    }
    int tmpCache = 0;
    int shared = 0;
    int startz = 0;
    int startx = 0;

    Vector3 vSum = Vector3(0,0,0);
    int iVerticeCount = m_Size * m_Size;
    //compute vertex normals
    for(int z=0; z<iVerticeCount; ++z)
    {
        index=0;
        startz = (int)(z / m_Size)-1;
        if(startz<0) startz = 0;
       
        for(i=startz; i<m_Size-1; ++i)
        {
            tmpCache = i * m_Size;
            startx = (int)(z % m_Size)-1;
            if(startx<0) startx=0;

            for(k=startx; k<m_Size-1; ++k)
            {

                iCache = k + tmpCache;
                index = (k + i * (m_Size-1))<<1;
                if( iCache+m_Size	== z || iCache == z || iCache+m_Size+1== z)
                {
                    vSum += faceNormals[index];
                    shared++;
                }
                if( iCache+m_Size+1== z || iCache == z || iCache+1 == z){
                    vSum += faceNormals[index+1];
                   shared++;
                }
                if(shared>5)
                    break;
            }
            if(shared>5)
                break;
        }

        STMath::Normalize(m_Vertices[z].n,vSum);

        vSum = Vector3(0,0,0);
        shared = 0;
    }

    delete [] faceNormals;
    STE::Profiler::end(__FUNCTION__);
}

There are two sections in this function, the first part calculates all the face normals for each triangle in the terrain. Then the second part calculates the vertex normals by adding the face normals that are shared by a vertex and normalizing it. Since the terrain is a heightmap a vertex can be shared by a maximum of 6 faces. The shared>5 checks detect this to break out of the loops earlier.

For a heightmap that has 513*513 vertices this function takes 563.028ms on my computer, that means when the terrain is modified and this function is called there is half a second delay for the lighting to be correct again.

Now the new code :

void GeoMipmapTerrain::computeNormals()
{
    if(!_vertices.size())
        return;
		
    sten::Profiler::begin(__FUNCTION__);
	
    std::vector<DirectX::XMVECTOR> _tempNormals;
    _tempNormals.resize(_vertices.size(), XMVectorZero());

    XMVECTOR quadVertices[4];
    XMVECTOR normal;

    auto* normalsPtr = &_tempNormals[0];
    auto* verticesPtr = &_vertices[0];

    int index = 0;
    int cache = 0;
    _smallestNormalY = std::numeric_limits<float>::infinity();

    for(unsigned int i=0; i<_size-1; ++i)
    {
        cache = i * _size;

        for(unsigned int k=0; k<_size-1; ++k)
        {
            index = k + cache;

            quadVertices[0] = XMLoadFloat3(&verticesPtr[index + _size].position);
            quadVertices[1] = XMLoadFloat3(&verticesPtr[index].position);
            quadVertices[2] = XMLoadFloat3(&verticesPtr[index + _size + 1].position);
            quadVertices[3] = XMLoadFloat3(&verticesPtr[index + 1].position);

            XMVECTOR n1 = XMVectorSubtract(quadVertices[0], quadVertices[2]);
            XMVECTOR n2 = XMVectorSubtract(quadVertices[2], quadVertices[1]);
            XMVECTOR n3 = XMVectorSubtract(quadVertices[3], quadVertices[2]);

            normal = XMVector3Normalize(XMVector3Cross(n1, n2));

            normalsPtr[index + _size] = XMVectorAdd(normalsPtr[index + _size], normal);
            normalsPtr[index] = XMVectorAdd(normalsPtr[index], normal);
            normalsPtr[index + _size + 1] = XMVectorAdd(normalsPtr[index + _size + 1], normal);

            normal = XMVector3Normalize(XMVector3Cross(n2, n3));

            normalsPtr[index + _size + 1] = XMVectorAdd(normalsPtr[index + _size + 1], normal);
            normalsPtr[index] = XMVectorAdd(normalsPtr[index], normal);
            normalsPtr[index + 1] = XMVectorAdd(normalsPtr[index + 1], normal);
        }
    }

    unsigned int vertexCount = _vertices.size();

    for(unsigned int i=0; i<vertexCount; ++i)
    {
        XMStoreFloat3(&verticesPtr[i].normal, XMVector3Normalize(normalsPtr[i]));

        if(verticesPtr[i].normal.y < _smallestNormalY)
            _smallestNormalY = verticesPtr[i].normal.y;
    }

    sten::Profiler::end(__FUNCTION__);
}

Both functions do the same thing, but the second one is a lot faster. First of all the second loop is much simpler because the first loop that calculates face normals adds and stores the shared normals for each vertex in a vector. This completely eliminates the need to determine which face normals are shared by which vertex in the second loop. The second loop only goes through each normal in the vector and normalizes them.

This version of the function also uses the XMVECTOR types from the DirectXMath library. The Vector3 class used in the first code is just a class holding 3 floats and isn't SIMD friendly.

The first function was 563.028ms, the second one does it's job in just 10.64ms. That is ~56 times faster and makes the editing of the terrain a lot better since the lighting is updated almost instantly when the terrain changes.