SimplePdfReader.cs

64 lines | 1.84 kB Blame History Raw Download
using System;
using System.Text;
using System.IO;
using System.Threading.Tasks;

using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

namespace Tools.PdfProvider
{
    /// <summary>
    /// Extracts the plain text of a PDF document using iText.
    /// </summary>
    public class SimplePdfReader
    {
        /// <summary>
        /// Reads every page of the PDF contained in <paramref name="stream"/> and
        /// returns the concatenated text of all pages.
        /// </summary>
        /// <remarks>
        /// The work is performed synchronously on the calling thread; the returned
        /// task is already completed when the method returns. Page texts are appended
        /// back to back, with no separator inserted between pages.
        /// </remarks>
        /// <param name="stream">Stream positioned at the PDF content. Must not be null.</param>
        /// <param name="ignoreError">
        /// When <c>true</c>, any exception raised while reading (including the null
        /// check on <paramref name="stream"/>) is swallowed and its
        /// <see cref="Exception.Message"/> is returned as the task result instead of
        /// the document text. When <c>false</c>, the exception propagates to the
        /// caller synchronously.
        /// </param>
        /// <returns>
        /// A completed task holding the extracted text, or the exception message when
        /// <paramref name="ignoreError"/> is <c>true</c> and reading failed.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="stream"/> is null and <paramref name="ignoreError"/> is <c>false</c>.
        /// </exception>
        public Task<string> ReadPdfAsync(
            Stream stream, 
            bool ignoreError
            )
        {
            try
            {
                // Fail with a descriptive error instead of an opaque failure from inside
                // the PDF library. Kept inside the try so ignoreError still applies.
                if (stream == null)
                {
                    throw new ArgumentNullException(nameof(stream));
                }

                StringBuilder text = new StringBuilder();

                // NOTE(review): disposing the reader/document may also close the
                // caller's stream - confirm against the iText version in use.
                using (PdfReader iTextReader = new PdfReader(stream))
                using (PdfDocument pdfDoc = new PdfDocument(iTextReader))
                {
                    int numberOfPages = pdfDoc.GetNumberOfPages();

                    // Pages are requested with 1-based numbers.
                    for (int page = 1; page <= numberOfPages; page++)
                    {
                        // A new strategy instance per page; presumably the strategy
                        // accumulates text across calls - confirm before hoisting it.
                        ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
                        string currentText = PdfTextExtractor.GetTextFromPage(
                            pdfDoc.GetPage(page),
                            strategy
                            );

                        text.Append(currentText);
                    }
                }

                return Task.FromResult(text.ToString());
            }
            catch (Exception ex) when (ignoreError)
            {
                // Best-effort mode: hand the failure description back as the result.
                // When ignoreError is false the filter does not match and the original
                // exception propagates with its stack trace intact.
                return Task.FromResult(ex.Message);
            }
        }
    }
}