Extract text and tables from PDF in C# and VB.NET
When reading the text content of a PDF file, GemBox.Document will recognize the file's logical structure and represent it using Table and Paragraph elements. You can read more about how GemBox.Document detects a PDF's structure on the Support level for reading PDF format help page.
If you don't need the logical structure, but instead want to know the exact position of the text (e.g., on which page and coordinates some text is located), take a look at this alternative approach for reading PDF text using GemBox.Pdf.
The following example shows how you can read paragraphs and tables from a PDF file using GemBox.Document.

using System;
using System.Linq;
using GemBox.Document;
using GemBox.Document.Tables;
/// <summary>
/// Console example that loads a PDF file with GemBox.Document and prints its
/// title and author, the number of paragraphs and tables, the first
/// paragraph's content and the last table's content.
/// </summary>
class Program
{
    /// <summary>
    /// Entry point: loads the input file, then writes the document summary,
    /// the first paragraph and the last table to the console.
    /// </summary>
    static void Main()
    {
        // Width each cell's text is padded to, and length of the dashed row
        // separator (56 = 4 * (13 + 1), i.e. four padded cells plus their '|').
        const int CellWidth = 13;
        const int SeparatorWidth = 56;

        // If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        var document = DocumentModel.Load("%InputFileName%");

        // Display file's properties.
        var properties = document.DocumentProperties;
        Console.WriteLine($"Title: {properties.BuiltIn[BuiltInDocumentProperty.Title]}");
        Console.WriteLine($"Author: {properties.BuiltIn[BuiltInDocumentProperty.Author]}");
        Console.WriteLine();

        // Get paragraphs and tables. The results are materialized once with
        // ToList() so that counting them and then picking the first/last
        // element does not run the LINQ query a second time.
        var paragraphs = document.GetChildElements(true, ElementType.Paragraph).Cast<Paragraph>().ToList();
        var tables = document.GetChildElements(true, ElementType.Table).Cast<Table>().ToList();

        // Display paragraphs and tables count.
        Console.WriteLine($"Paragraph count: {paragraphs.Count}");
        Console.WriteLine($"Table count: {tables.Count}");
        Console.WriteLine();

        // Display first paragraph's content. Skipped when the document has no
        // paragraphs, because First() would throw on an empty sequence.
        if (paragraphs.Count > 0)
        {
            var paragraph = paragraphs.First();
            Console.WriteLine("Paragraph content:");
            Console.WriteLine(paragraph.Content.ToString());
        }

        // Display last table's content. Skipped when the document has no
        // tables, because Last() would throw on an empty sequence.
        if (tables.Count > 0)
        {
            var table = tables.Last();
            Console.WriteLine("Table content:");
            foreach (var row in table.Rows)
            {
                Console.WriteLine(new string('-', SeparatorWidth));

                // One line per row: every cell trimmed, padded and closed with '|'.
                foreach (var cell in row.Cells)
                {
                    Console.Write($"{cell.Content.ToString().TrimEnd().PadRight(CellWidth)}|");
                }

                Console.WriteLine();
            }
        }
    }
}
Imports System
Imports System.Linq
Imports GemBox.Document
Imports GemBox.Document.Tables
''' <summary>
''' Console example that loads a PDF file with GemBox.Document and prints its
''' title and author, the number of paragraphs and tables, the first
''' paragraph's content and the last table's content.
''' </summary>
Module Program

    ''' <summary>
    ''' Entry point: loads the input file, then writes the document summary,
    ''' the first paragraph and the last table to the console.
    ''' </summary>
    Sub Main()

        ' Width each cell's text is padded to, and length of the dashed row
        ' separator (56 = 4 * (13 + 1), i.e. four padded cells plus their "|").
        Const CellWidth As Integer = 13
        Const SeparatorWidth As Integer = 56

        ' If using the Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Dim document = DocumentModel.Load("%InputFileName%")

        ' Display file's properties.
        Dim properties = document.DocumentProperties
        Console.WriteLine($"Title: {properties.BuiltIn(BuiltInDocumentProperty.Title)}")
        Console.WriteLine($"Author: {properties.BuiltIn(BuiltInDocumentProperty.Author)}")
        Console.WriteLine()

        ' Get paragraphs and tables. The results are materialized once with
        ' ToList() so that counting them and then picking the first/last
        ' element does not run the LINQ query a second time.
        Dim paragraphs = document.GetChildElements(True, ElementType.Paragraph).Cast(Of Paragraph)().ToList()
        Dim tables = document.GetChildElements(True, ElementType.Table).Cast(Of Table)().ToList()

        ' Display paragraphs and tables count.
        Console.WriteLine($"Paragraph count: {paragraphs.Count}")
        Console.WriteLine($"Table count: {tables.Count}")
        Console.WriteLine()

        ' Display first paragraph's content. Skipped when the document has no
        ' paragraphs, because First() would throw on an empty sequence.
        If paragraphs.Count > 0 Then
            Dim paragraph = paragraphs.First()
            Console.WriteLine("Paragraph content:")
            Console.WriteLine(paragraph.Content.ToString())
        End If

        ' Display last table's content. Skipped when the document has no
        ' tables, because Last() would throw on an empty sequence.
        If tables.Count > 0 Then
            Dim table = tables.Last()
            Console.WriteLine("Table content:")
            For Each row In table.Rows
                Console.WriteLine(New String("-"c, SeparatorWidth))

                ' One line per row: every cell trimmed, padded and closed with "|".
                For Each cell In row.Cells
                    Console.Write($"{cell.Content.ToString().TrimEnd().PadRight(CellWidth)}|")
                Next

                Console.WriteLine()
            Next
        End If

    End Sub

End Module
See also
Next steps
Published: October 17, 2018 | Modified: December 19, 2022 | Author: Mario Zorica