Background

测试用PyTorch3D生成的Mesh时,打算用VTK对Mesh做离屏渲染生成图片,结果发现VTK的python binding和C++库都会在vtkOBJReader::Update时崩溃。由于是AI模型生成的OBJ文件,OBJ本身是有可能不太标准的,但用Blender、Open3D、trimesh测试,发现都可以加载该文件。看起来似乎VTK的vtkOBJReader实现不够健壮,遂决定调试一番。

Debug

以导致问题的OBJ文件编写复现demo,目录结构:

$ tree .
.
├── assets
│   ├── rand_0_diffuse.png
│   ├── rand_0_normal.png
│   ├── rand_0_skin.mtl
│   ├── rand_0_skin.obj
│   └── rand_0_spec.png
├── CMakeLists.txt
└── main.cpp

main.cpp

#include <vtkOBJReader.h>

int main()
{
    vtkNew<vtkOBJReader> reader;
    reader->SetFileName("rand_0_skin.obj"); // the OBJ file that triggers the crash
    reader->Update(); // executes the reader pipeline; the crash stack originates from this call (main.cpp line 7)
    return 0;
}

崩溃堆栈:

1  vtkAOSDataArrayTemplate<float>::GetTuple         vtkAOSDataArrayTemplate.txx          275  0x7ffff6ee05e5 
2  vtkOBJReader::RequestData                        vtkOBJReader.cxx                     978  0x7ffff7b4c793 
3  vtkPolyDataAlgorithm::ProcessRequest             vtkPolyDataAlgorithm.cxx             87   0x7ffff28aaec6 
4  vtkExecutive::CallAlgorithm                      vtkExecutive.cxx                     734  0x7ffff287faf9 
5  vtkDemandDrivenPipeline::ExecuteData             vtkDemandDrivenPipeline.cxx          461  0x7ffff2876004 
6  vtkCompositeDataPipeline::ExecuteData            vtkCompositeDataPipeline.cxx         162  0x7ffff2869321 
7  vtkDemandDrivenPipeline::ProcessRequest          vtkDemandDrivenPipeline.cxx          260  0x7ffff28755b1 
8  vtkStreamingDemandDrivenPipeline::ProcessRequest vtkStreamingDemandDrivenPipeline.cxx 343  0x7ffff2943429 
9  vtkDemandDrivenPipeline::UpdateData              vtkDemandDrivenPipeline.cxx          418  0x7ffff2875de9 
10 vtkStreamingDemandDrivenPipeline::Update         vtkStreamingDemandDrivenPipeline.cxx 417  0x7ffff29437c4 
11 vtkStreamingDemandDrivenPipeline::Update         vtkStreamingDemandDrivenPipeline.cxx 380  0x7ffff294364d 
12 vtkAlgorithm::Update                             vtkAlgorithm.cxx                     1406 0x7ffff285f3b4 
13 vtkAlgorithm::Update                             vtkAlgorithm.cxx                     1400 0x7ffff285f37f 
14 main                                             main.cpp                             7    0x555555555297 

调试后,发现崩溃的直接原因是vtkAOSDataArrayTemplate<float>的Buffer内部缓冲区为null,由此计算出的data实际是野指针:

// Converts the components of tuple `tupleIdx` to double, stores them in
// this->LegacyTuple, and returns a pointer to that storage.
// NOTE(review): no bounds or null check is visible here. `data` is obtained by
// plain pointer arithmetic on the buffer, so if the array holds no data (or
// tupleIdx is out of range) the loop below reads through an invalid pointer.
template <class ValueTypeT>
double* vtkAOSDataArrayTemplate<ValueTypeT>::GetTuple(vtkIdType tupleIdx)
{
  // Start of the requested tuple: base buffer pointer + tupleIdx * components-per-tuple.
  ValueTypeT* data = this->Buffer->GetBuffer() + tupleIdx * this->NumberOfComponents;
  double* tuple = &this->LegacyTuple[0];
  // See note in SetTuple about std::copy vs for loops on MSVC.
  for (int i = 0; i < this->NumberOfComponents; ++i)
  {
    tuple[i] = static_cast<double>(data[i]); // dereferences `data`; this is where an invalid pointer faults
  }
  return &this->LegacyTuple[0];
}

转到vtkOBJReader::RequestData调试,发现normals从未加入过值。以纯文本方式打开该OBJ,发现确实没有任何vn(vertex normal)字段,这也就解释了为何normals为空:

int vtkOBJReader::RequestData(vtkInformation* vtkNotUsed(request),
  vtkInformationVector** vtkNotUsed(inputVector), vtkInformationVector* outputVector)
{
    // ...
    if (n_normal_pts > 0)
    {
        new_normals->InsertNextTuple(normals->GetTuple(normal_pts[pointi]));
    }
    // ...
}

但等等,为什么n_normal_pts为3而非0?从变量命名上看,该变量应该记录的是normal的数量才对。n_normal_pts获取自normal_polys:

int vtkOBJReader::RequestData(vtkInformation* vtkNotUsed(request),
  vtkInformationVector** vtkNotUsed(inputVector), vtkInformationVector* outputVector)
{
    //...
    tcoord_polys->GetNextCell(n_tcoord_pts, tcoord_pts);
    normal_polys->GetNextCell(n_normal_pts, normal_pts);
    //...
}

进一步调试,发现normal_polys是在以下片段修改的:

int vtkOBJReader::RequestData(vtkInformation* vtkNotUsed(request),
  vtkInformationVector** vtkNotUsed(inputVector), vtkInformationVector* outputVector)
{
    //...
    else if (strcmp(cmd, "f") == 0)
    {
        // face definition, consisting of 1-based indices separated by whitespace and /
        polys->InsertNextCell(0); // we don't yet know how many points are to come
        tcoord_polys->InsertNextCell(0);
        normal_polys->InsertNextCell(0);

        int nVerts = 0, nTCoords = 0, nNormals = 0; // keep a count of how many of each there are

        while (everything_ok && pLine < pEnd)
        {
            // find the first non-whitespace character
            while (isspace(*pLine) && pLine < pEnd)
            {
                pLine++;
            }

            if (pLine < pEnd) // there is still data left on this line
            {
                int iVert, iTCoord, iNormal;
                if (sscanf(pLine, "%d/%d/%d", &iVert, &iTCoord, &iNormal) == 3)
                {
                    if (iVert < 0)
                    {
                        polys->InsertCellPoint(numPoints + iVert);
                    }
                    else
                    {
                        polys->InsertCellPoint(iVert - 1);
                    }
                    nVerts++;

                    // Current index is relative to last texture index
                    int iTCoordAbs = (iTCoord < 0) ? numTCoords + iTCoord : iTCoord - 1;
                    tcoord_polys->InsertCellPoint(iTCoordAbs);

                    // Set the current texture array with the value corresponding to the
                    // iTcoords read
                    const auto& currentTCoord = verticesTextureList[iTCoordAbs];
                    auto iter = tcoords_map.find(tcoordsName);
                    vtkFloatArray* tcArray = iter->second;
                    tcArray->SetTuple2(iTCoordAbs, currentTCoord.first, currentTCoord.second);

                    nTCoords++;

                    // Current index is relative to last normal index
                    if (iNormal < 0)
                    {
                        normal_polys->InsertCellPoint(numNormals + iNormal);
                    }
                    else
                    {
                        normal_polys->InsertCellPoint(iNormal - 1);
                    }
                    nNormals++;
                    if (iTCoord != iVert)
                    {
                        tcoords_same_as_verts = false;
                    }
                    if (iNormal != iVert)
                    {
                        normals_same_as_verts = false;
                    }
                }
                // ...
            }
	}
    // ...
}

OBJ中的f(face)字段,按 http://paulbourke.net/dataformats/obj/ 对OBJ格式的介绍,可以用reference number以索引的方式引用出现过的顶点和法线数据:

Using v, vt, and vn to represent geometric vertices, texture vertices, and vertex normals, the statement would read:

f v/vt/vn v/vt/vn v/vt/vn v/vt/vn

再看OBJ文件,也的确有众多的f字段:

usemtl material_0
vt 0.648233 0.593283
vt 0.646499 0.584196
vt 0.655584 0.585689
f 2/1/2 3/2/3 1/3/1
vt 0.406369 0.356120
vt 0.403783 0.346783
vt 0.415342 0.350329
f 5/4/5 6/5/6 4/6/4

而正如上文所言,该OBJ并没有任何vn字段,即f字段引用的顶点法线都是无效的。vtkOBJReader将引用的法线数视为已插入的法线数,这就导致了n_normal_pts非0,而normals实际没有任何数据。

Fix

原因探查清楚后,对于这类野指针问题,修复措施也很简单,加个判断即可:


               }
             }
             // copy the normal for this point across (if there is one)
-            if (n_normal_pts > 0)
+            if (n_normal_pts > 0 && normals->GetNumberOfTuples() > 0)
             {
               new_normals->InsertNextTuple(normals->GetTuple(normal_pts[pointi]));
             }

Merge Request

修复已合并至VTK主干:https://gitlab.kitware.com/vtk/vtk/-/merge_requests/9827?commit_id=4b7d53504bb3e786517024fe32dc5c80e7f6e2c4