Open and read Word file in C# and VB.NET

With GemBox.Document you can read many Word files (DOCX, DOC, RTF, HTML, etc.) in the same manner. The files can be loaded from a path or a stream in your C# and VB.NET application by using one of the DocumentModel.Load methods.

You can specify the file format of your Word document by providing an object of a LoadOptions-derived class (like DocxLoadOptions, DocLoadOptions, RtfLoadOptions, HtmlLoadOptions, etc.). Alternatively, you can omit the LoadOptions and let GemBox.Document choose the appropriate options for you when opening the file.

The following example demonstrates the easiest way to read the text from a Word file.

Screenshot of read text from input Word document
Reading Word document's text in C# and VB.NET
Upload your file (Drag file here)
using System;
using System.Linq;
using System.Text.RegularExpressions;
using GemBox.Document;

class Program
{
    /// <summary>
    /// Loads a Word document, prints its character, word, paragraph and page counts,
    /// and then prints the document's plain text to the console.
    /// </summary>
    static void Main()
    {
        // If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        // Open the Word file from its path; no LoadOptions are passed here.
        DocumentModel document = DocumentModel.Load("%InputFileName%");

        // Plain-text representation of the whole document.
        string plainText = document.Content.ToString();

        // Line breaks are stripped first so that they are not counted as characters.
        string textWithoutLineBreaks = plainText.Replace(Environment.NewLine, string.Empty);
        int characterTotal = textWithoutLineBreaks.Length;

        // A word is counted as any uninterrupted run of non-whitespace characters.
        MatchCollection wordMatches = Regex.Matches(plainText, @"[\S]+");
        int wordTotal = wordMatches.Count;

        // Count the Paragraph elements and the pages produced by the paginator.
        int paragraphTotal = document.GetChildElements(true, ElementType.Paragraph).Count();
        int pageTotal = document.GetPaginator().Pages.Count;

        // Statistics are printed first, followed by an empty separator line.
        string[] report =
        {
            $"Characters count: {characterTotal}",
            $"     Words count: {wordTotal}",
            $"Paragraphs count: {paragraphTotal}",
            $"     Pages count: {pageTotal}",
            string.Empty
        };

        foreach (string line in report)
        {
            Console.WriteLine(line);
        }

        // Finally, print the document's text content.
        Console.WriteLine(plainText);
    }
}
Imports System
Imports System.Linq
Imports System.Text.RegularExpressions
Imports GemBox.Document

Module Program

    ''' <summary>
    ''' Loads a Word document, prints its character, word, paragraph and page counts,
    ''' and then prints the document's plain text to the console.
    ''' </summary>
    Sub Main()

        ' If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        ' Open the Word file from its path; no LoadOptions are passed here.
        Dim document As DocumentModel = DocumentModel.Load("%InputFileName%")

        ' Plain-text representation of the whole document.
        Dim plainText As String = document.Content.ToString()

        ' Line breaks are stripped first so that they are not counted as characters.
        Dim textWithoutLineBreaks As String = plainText.Replace(Environment.NewLine, String.Empty)
        Dim characterTotal As Integer = textWithoutLineBreaks.Length

        ' A word is counted as any uninterrupted run of non-whitespace characters.
        Dim wordMatches As MatchCollection = Regex.Matches(plainText, "[\S]+")
        Dim wordTotal As Integer = wordMatches.Count

        ' Count the Paragraph elements and the pages produced by the paginator.
        Dim paragraphTotal As Integer = document.GetChildElements(True, ElementType.Paragraph).Count()
        Dim pageTotal As Integer = document.GetPaginator().Pages.Count

        ' Statistics are printed first, followed by an empty separator line.
        Dim report As String() = {
            $"Characters count: {characterTotal}",
            $"     Words count: {wordTotal}",
            $"Paragraphs count: {paragraphTotal}",
            $"     Pages count: {pageTotal}",
            String.Empty
        }

        For Each reportLine As String In report
            Console.WriteLine(reportLine)
        Next

        ' Finally, print the document's text content.
        Console.WriteLine(plainText)

    End Sub
End Module

Reading Word document's elements

Besides reading the text of the whole document, you can also read just a part of it, such as a specific Section element or HeaderFooter element. Each element has a Content property from which you can extract its text via the Content.ToString method.

The following example demonstrates how you can traverse through all Paragraph elements and their child Run elements, read their text and their formatting. To read more about the visual information of the content elements, see the Formattings and Styles help page.

Screenshot of read elements from input Word document
Reading Word document's text and formatting in C# and VB.NET
Upload your file (Drag file here)
using System;
using System.IO;
using System.Linq;
using GemBox.Document;

class Program
{
    // Offsets that move ASCII letters onto the 'Mathematical Bold Italic' Unicode letters:
    // 'A' (65) + 119847 = U+1D468 and 'a' (97) + 119841 = U+1D482.
    private const int UpperCaseOffset = 119847;
    private const int LowerCaseOffset = 119841;

    /// <summary>
    /// Writes the text of every Run in every Paragraph of the document to "Output.txt",
    /// one paragraph per line, replacing the letters of bold runs with
    /// 'Mathematical Bold Italic' Unicode characters.
    /// </summary>
    static void Main()
    {
        // If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        var document = DocumentModel.Load("%InputFileName%");
        using (var output = File.CreateText("Output.txt"))
        {
            // Visit every Paragraph element in the Word document.
            foreach (Paragraph paragraph in document.GetChildElements(true, ElementType.Paragraph))
            {
                // Visit every Run element inside the current Paragraph.
                foreach (Run run in paragraph.GetChildElements(true, ElementType.Run))
                {
                    string content = run.Text;
                    CharacterFormat formatting = run.CharacterFormat;

                    // Bold text is rendered with bold-italic math letters,
                    // for instance "ABC" becomes "𝑨𝑩𝑪".
                    if (formatting.Bold)
                    {
                        content = string.Concat(content.Select(c => ToMathBoldItalic(c)));
                    }

                    output.Write(content);
                }

                // End the line once all runs of the paragraph are written.
                output.WriteLine();
            }
        }
    }

    /// <summary>
    /// Maps an ASCII letter to its 'Mathematical Bold Italic' counterpart;
    /// any other character is returned unchanged as a one-character string.
    /// </summary>
    private static string ToMathBoldItalic(char c)
    {
        if (c >= 'A' && c <= 'Z')
        {
            return char.ConvertFromUtf32(UpperCaseOffset + c);
        }

        if (c >= 'a' && c <= 'z')
        {
            return char.ConvertFromUtf32(LowerCaseOffset + c);
        }

        return c.ToString();
    }
}
Imports System
Imports System.IO
Imports System.Linq
Imports GemBox.Document

Module Program

    ' Offsets that move ASCII letters onto the 'Mathematical Bold Italic' Unicode letters:
    ' "A" (65) + 119847 = U+1D468 and "a" (97) + 119841 = U+1D482.
    Private Const UpperCaseOffset As Integer = 119847
    Private Const LowerCaseOffset As Integer = 119841

    ''' <summary>
    ''' Writes the text of every Run in every Paragraph of the document to "Output.txt",
    ''' one paragraph per line, replacing the letters of bold runs with
    ''' 'Mathematical Bold Italic' Unicode characters.
    ''' </summary>
    Sub Main()

        ' If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Dim document = DocumentModel.Load("%InputFileName%")
        Using output = File.CreateText("Output.txt")

            ' Visit every Paragraph element in the Word document.
            For Each paragraph As Paragraph In document.GetChildElements(True, ElementType.Paragraph)

                ' Visit every Run element inside the current Paragraph.
                For Each run As Run In paragraph.GetChildElements(True, ElementType.Run)

                    Dim content As String = run.Text
                    Dim formatting As CharacterFormat = run.CharacterFormat

                    ' Bold text is rendered with bold-italic math letters,
                    ' for instance "ABC" becomes "𝑨𝑩𝑪".
                    If formatting.Bold Then
                        content = String.Concat(content.Select(Function(c) ToMathBoldItalic(c)))
                    End If

                    output.Write(content)
                Next

                ' End the line once all runs of the paragraph are written.
                output.WriteLine()
            Next
        End Using

    End Sub

    ''' <summary>
    ''' Maps an ASCII letter to its 'Mathematical Bold Italic' counterpart;
    ''' any other character is returned unchanged as a one-character string.
    ''' </summary>
    Private Function ToMathBoldItalic(ByVal c As Char) As String
        If c >= "A"c AndAlso c <= "Z"c Then
            Return Char.ConvertFromUtf32(UpperCaseOffset + AscW(c))
        End If

        If c >= "a"c AndAlso c <= "z"c Then
            Return Char.ConvertFromUtf32(LowerCaseOffset + AscW(c))
        End If

        Return c.ToString()
    End Function
End Module

By combining these two examples you can achieve various tasks, like selecting only the Table elements and reading their text content, or selecting only the Picture elements and extracting their images, or reading the Run.Text property of only the highlighted elements (the ones that have CharacterFormat.HighlightColor), etc.

Reading Word document's pages

Word files (DOCX, DOC, RTF, HTML, etc.) don't have a page concept, which means they don't contain the information about how many pages they occupy nor which element is on which page.

They are of a flow document type, and their content is written in a flowable manner. The page concept is specific to the Word application that renders or displays the document.

On the other hand, files of a fixed document type (PDF, XPS, etc.) do have a page concept. Their content is fixed: the exact page location at which each element is rendered is defined in the file.

GemBox.Document uses its rendering engine to paginate and render the document's content when saving to PDF, XPS or an image format. So, the best and easiest way to read the text content of a specific page is to convert the Word document to a PDF file (or save just a specific Word page as PDF) with GemBox.Document and then read the PDF page's text content with our other component, GemBox.Pdf.

Nevertheless, the following example demonstrates how you can use GemBox.Document's rendering engine to retrieve each document page as a FrameworkElement object from the WPF framework and then extract text from it using the provided FrameworkElement.ToText extension method.

Screenshot of read page from input Word document
Reading Word document's page in C# and VB.NET
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Windows;
using System.Windows.Media;
using GemBox.Document;

class Program
{
    /// <summary>
    /// Paginates "Reading.docx" and prints the text of every page to the console,
    /// each page preceded by a "Page X of Y" header framed by dashed lines.
    /// </summary>
    // The page content is a WPF FrameworkElement, hence the STA thread attribute.
    [STAThread]
    static void Main()
    {
        // If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY");

        var document = DocumentModel.Load("Reading.docx");
        var pages = document.GetPaginator().Pages;

        // The separator line is the same for every page, so build it only once.
        string separator = new string('-', 50);

        for (int i = 0, count = pages.Count; i < count; ++i)
        {
            Console.WriteLine(separator);
            Console.WriteLine($"Page {i + 1} of {count}");
            Console.WriteLine(separator);

            // Get FrameworkElement object from Word document's page.
            DocumentModelPage page = pages[i];
            FrameworkElement pageContent = page.PageContent;

            // Extract text from FrameworkElement object.
            Console.WriteLine(pageContent.ToText());
        }
    }
}

/// <summary>
/// Contains methods that are used to extract text out of a FrameworkElement object.
/// </summary>
public static class GemBoxDocumentHelper
{
    /// <summary>
    /// Extracts the text drawn by all <see cref="DrawingVisual"/> objects found in the
    /// visual tree of <paramref name="root"/>.
    /// </summary>
    /// <param name="root">The element whose visual tree is searched for glyph runs.</param>
    /// <returns>
    /// The characters of all glyph runs, ordered top-to-bottom and left-to-right within each
    /// visual, with line breaks and spaces inserted heuristically from the runs' positions.
    /// </returns>
    public static string ToText(this FrameworkElement root)
    {
        var builder = new StringBuilder();

        foreach (var visual in root.GetSelfAndDescendants().OfType<DrawingVisual>())
        {
            // A DrawingVisual that has no rendered content can return a null Drawing;
            // skip it instead of failing inside the DrawingGroup traversal below.
            DrawingGroup drawing = visual.Drawing;
            if (drawing == null)
            {
                continue;
            }

            GlyphRun previousRun = null;

            // Order runs first vertically (Y), then horizontally (X).
            var orderedRuns = drawing
                .GetSelfAndDescendants()
                .OfType<GlyphRunDrawing>()
                .Select(glyph => glyph.GlyphRun)
                .OrderBy(run => run.BaselineOrigin.Y)
                .ThenBy(run => run.BaselineOrigin.X);

            foreach (var currentRun in orderedRuns)
            {
                if (previousRun != null)
                {
                    // If base-line of current text segment is left from base-line of previous text segment, then assume that it is new line.
                    if (currentRun.BaselineOrigin.X <= previousRun.BaselineOrigin.X)
                    {
                        builder.AppendLine();
                    }
                    else
                    {
                        Rect currentRect = currentRun.ComputeAlignmentBox();
                        Rect previousRect = previousRun.ComputeAlignmentBox();

                        // Horizontal gap between the end of the previous run and the start of the current one.
                        double spaceWidth = currentRun.BaselineOrigin.X + currentRect.Left - previousRun.BaselineOrigin.X - previousRect.Right;
                        // Average height of the two runs, used as the scale for the gap threshold.
                        double spaceHeight = (currentRect.Height + previousRect.Height) / 2;

                        // If space between successive text segments has width greater than a sixth of its height, then assume that it is a word (add a space).
                        if (spaceWidth > spaceHeight / 6)
                        {
                            builder.Append(' ');
                        }
                    }
                }

                builder.Append(currentRun.Characters.ToArray());
                previousRun = currentRun;
            }
        }

        return builder.ToString();
    }

    /// <summary>
    /// Enumerates <paramref name="parent"/> followed by all of its visual-tree descendants,
    /// depth-first, in child-index order.
    /// </summary>
    private static IEnumerable<DependencyObject> GetSelfAndDescendants(this DependencyObject parent)
    {
        yield return parent;

        for (int i = 0, count = VisualTreeHelper.GetChildrenCount(parent); i < count; i++)
        {
            foreach (var descendant in VisualTreeHelper.GetChild(parent, i).GetSelfAndDescendants())
            {
                yield return descendant;
            }
        }
    }

    /// <summary>
    /// Enumerates <paramref name="parent"/> followed by all drawings nested inside it;
    /// nested <see cref="DrawingGroup"/> children are expanded recursively.
    /// </summary>
    private static IEnumerable<Drawing> GetSelfAndDescendants(this DrawingGroup parent)
    {
        yield return parent;

        foreach (var child in parent.Children)
        {
            var group = child as DrawingGroup;
            if (group != null)
            {
                foreach (var descendant in group.GetSelfAndDescendants())
                {
                    yield return descendant;
                }
            }
            else
            {
                yield return child;
            }
        }
    }
}
Imports System
Imports System.Collections.Generic
Imports System.Linq
Imports System.Text
Imports System.Windows
Imports System.Windows.Media
Imports GemBox.Document

Module Program

    ''' <summary>
    ''' Paginates "Reading.docx" and prints the text of every page to the console,
    ''' each page preceded by a "Page X of Y" header framed by dashed lines.
    ''' </summary>
    <STAThread>
    Sub Main()

        ' If using Professional version, put your serial key below.
        ComponentInfo.SetLicense("FREE-LIMITED-KEY")

        Dim document = DocumentModel.Load("Reading.docx")
        Dim pages = document.GetPaginator().Pages
        Dim count = pages.Count

        ' The separator line is the same for every page, so build it only once.
        Dim separator As New String("-"c, 50)

        For i = 0 To count - 1
            Console.WriteLine(separator)
            Console.WriteLine($"Page {i + 1} of {count}")
            Console.WriteLine(separator)

            ' Get FrameworkElement object from Word document's page.
            Dim page As DocumentModelPage = pages(i)
            Dim pageContent As FrameworkElement = page.PageContent

            ' Extract text from FrameworkElement object.
            Console.WriteLine(pageContent.ToText())
        Next

    End Sub
End Module

''' <summary>
''' Contains methods that are used to extract text out of a FrameworkElement object.
''' </summary>
Module GemBoxDocumentHelper

    ''' <summary>
    ''' Extracts the text drawn by all DrawingVisual objects found in the visual tree of
    ''' <paramref name="root"/>.
    ''' </summary>
    ''' <param name="root">The element whose visual tree is searched for glyph runs.</param>
    ''' <returns>
    ''' The characters of all glyph runs, ordered top-to-bottom and left-to-right within each
    ''' visual, with line breaks and spaces inserted heuristically from the runs' positions.
    ''' </returns>
    <Runtime.CompilerServices.Extension>
    Function ToText(ByVal root As FrameworkElement) As String
        Dim builder As New StringBuilder()

        For Each visual In root.GetSelfAndDescendants().OfType(Of DrawingVisual)()
            ' A DrawingVisual that has no rendered content can return a Drawing of Nothing;
            ' skip it instead of failing inside the DrawingGroup traversal below.
            Dim drawing As DrawingGroup = visual.Drawing
            If drawing Is Nothing Then
                Continue For
            End If

            Dim previousRun As GlyphRun = Nothing

            ' Order runs first vertically (Y), then horizontally (X).
            Dim orderedRuns = drawing _
                .GetSelfAndDescendants() _
                .OfType(Of GlyphRunDrawing)() _
                .Select(Function(glyph) glyph.GlyphRun) _
                .OrderBy(Function(run) run.BaselineOrigin.Y) _
                .ThenBy(Function(run) run.BaselineOrigin.X)

            For Each currentRun In orderedRuns

                If previousRun IsNot Nothing Then
                    ' If base-line of current text segment is left from base-line of previous text segment, then assume that it is new line.
                    If currentRun.BaselineOrigin.X <= previousRun.BaselineOrigin.X Then
                        builder.AppendLine()
                    Else
                        Dim currentRect As Rect = currentRun.ComputeAlignmentBox()
                        Dim previousRect As Rect = previousRun.ComputeAlignmentBox()

                        ' Horizontal gap between the end of the previous run and the start of the current one.
                        Dim spaceWidth As Double = currentRun.BaselineOrigin.X + currentRect.Left - previousRun.BaselineOrigin.X - previousRect.Right
                        ' Average height of the two runs, used as the scale for the gap threshold.
                        Dim spaceHeight As Double = (currentRect.Height + previousRect.Height) / 2

                        ' If space between successive text segments has width greater than a sixth of its height, then assume that it is a word (add a space).
                        If spaceWidth > spaceHeight / 6 Then
                            builder.Append(" "c)
                        End If
                    End If
                End If

                builder.Append(currentRun.Characters.ToArray())
                previousRun = currentRun
            Next
        Next

        Return builder.ToString()
    End Function

    ''' <summary>
    ''' Enumerates <paramref name="parent"/> followed by all of its visual-tree descendants,
    ''' depth-first, in child-index order.
    ''' </summary>
    <Runtime.CompilerServices.Extension>
    Private Iterator Function GetSelfAndDescendants(ByVal parent As DependencyObject) As IEnumerable(Of DependencyObject)
        Yield parent

        Dim count = VisualTreeHelper.GetChildrenCount(parent)
        For i = 0 To count - 1
            For Each descendant In VisualTreeHelper.GetChild(parent, i).GetSelfAndDescendants()
                Yield descendant
            Next
        Next
    End Function

    ''' <summary>
    ''' Enumerates <paramref name="parent"/> followed by all drawings nested inside it;
    ''' nested DrawingGroup children are expanded recursively.
    ''' </summary>
    <Runtime.CompilerServices.Extension>
    Private Iterator Function GetSelfAndDescendants(ByVal parent As DrawingGroup) As IEnumerable(Of Drawing)
        Yield parent

        For Each child In parent.Children
            Dim group = TryCast(child, DrawingGroup)
            If group IsNot Nothing Then
                For Each descendant In group.GetSelfAndDescendants()
                    Yield descendant
                Next
            Else
                Yield child
            End If
        Next
    End Function
End Module

Check next example or download examples from GitHub.