Read and extract PDF text in C# and VB.NET
When reading a PDF file's text content, GemBox.Document represents it with Paragraph and Table elements, using various heuristics to recognize the PDF file's logical structure. You can read more about this on the "Support level for reading PDF format" help page.
If you don't need the logical structure, but instead want to know the exact position of the text (e.g., on which page and coordinates some text is located), take a look at this alternative approach for reading PDF text using GemBox.Pdf.
The following example shows how you can read a PDF file with GemBox.Document and extract the file's properties as well as text that matches a specified regular expression.

using System;
using System.Text.RegularExpressions;
using GemBox.Document;
/// <summary>
/// Console sample that loads a document with GemBox.Document, prints two of
/// its built-in properties and then lists every regular-expression match
/// found in the document's text content.
/// </summary>
class Program
{
    /// <summary>
    /// Sets the license key, loads the input file, and writes the results to
    /// the console.
    /// </summary>
    static void Main()
    {
        // If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        var pdf = DocumentModel.Load("%InputFileName%");
        var info = pdf.DocumentProperties;

        // Print the built-in properties one at a time, each looked up right
        // before it is written.
        Console.WriteLine($"Author: {info.BuiltIn[BuiltInDocumentProperty.Author]}");
        Console.WriteLine($"Created on: {info.BuiltIn[BuiltInDocumentProperty.DateContentCreated]}");
        Console.WriteLine();

        // Flatten the document content to a string and scan it for three
        // whitespace-separated numbers: an integer followed by two values
        // written with a dot and two digits.
        string content = pdf.Content.ToString();
        var pattern = new Regex(@"(?<Hours>\d+)\s+(?<Unit>\d+\.\d{2})\s+(?<Price>\d+\.\d{2})");

        // Walk the matches in order, from the first one to the last.
        for (Match hit = pattern.Match(content); hit.Success; hit = hit.NextMatch())
        {
            Console.WriteLine($"Hours={hit.Groups["Hours"]} | Unit={hit.Groups["Unit"]} | Price={hit.Groups["Price"]}");
        }
    }
}
Imports System
Imports System.Text.RegularExpressions
Imports GemBox.Document
''' <summary>
''' Console sample that loads a document with GemBox.Document, prints two of
''' its built-in properties and then lists every regular-expression match
''' found in the document's text content.
''' </summary>
Module Program
    ''' <summary>
    ''' Sets the license key, loads the input file, and writes the results to
    ''' the console.
    ''' </summary>
    Sub Main()
        ' If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Dim pdf = DocumentModel.Load("%InputFileName%")
        Dim info = pdf.DocumentProperties

        ' Print the built-in properties one at a time, each looked up right
        ' before it is written.
        Console.WriteLine($"Author: {info.BuiltIn(BuiltInDocumentProperty.Author)}")
        Console.WriteLine($"Created on: {info.BuiltIn(BuiltInDocumentProperty.DateContentCreated)}")
        Console.WriteLine()

        ' Flatten the document content to a string and scan it for three
        ' whitespace-separated numbers: an integer followed by two values
        ' written with a dot and two digits.
        Dim content As String = pdf.Content.ToString()
        Dim pattern = New Regex("(?<Hours>\d+)\s+(?<Unit>\d+\.\d{2})\s+(?<Price>\d+\.\d{2})")

        ' Walk the matches in order, from the first one to the last.
        Dim hit As Match = pattern.Match(content)
        While hit.Success
            Console.WriteLine($"Hours={hit.Groups("Hours")} | Unit={hit.Groups("Unit")} | Price={hit.Groups("Price")}")
            hit = hit.NextMatch()
        End While
    End Sub
End Module
Want more?
Like it?
Published: October 17, 2018 | Modified: December 3, 2019 | Author: Mario Zorica