PDF数据提取------3.解析Demo

96 阅读 0 评论 64 点赞

我是靠谱客的博主欢喜寒风，这篇文章主要介绍PDF数据提取------3.解析Demo，现在分享给大家，希望可以做个参考。

1.PDF中文本字符串格式中关键值信息抓取（已完成）

简介：这种解析方式比较传统，也最简单，主要依靠熟练使用正则表达式（Regular Expression）做语义识别和验证。例如抓取下面红色圈内的关键信息。

1.1

复制代码

// Example call site: fetch the meeting date text extracted from the PDF.
string mettingData=GetMeetingData();

/// <summary>
/// Searches every page of the PDF for text that mentions a meeting ("会议") followed by a
/// Chinese-formatted date, then hands the hits to <c>GetMeetingDateFilter</c> together with
/// a list of date/time patterns ordered from most to least specific.
/// </summary>
/// <returns>Whatever <c>GetMeetingDateFilter</c> returns for the coarse hits.</returns>
public string GetMeetingData()
{
    // Coarse pattern: "会议", 2-15 arbitrary characters, a y年M月d日 date and up to 15 trailing
    // characters (room for a weekday and a time of day).
    // The regex escapes (\s, \d) were lost in the published snippet and are restored here.
    string patternAll = @"(?<NDAandCAMDate>会\s*议\s*.{2,15}\d{2,4}\s*年\s*\d{1,2}\s*月\s*\d{1,2}\s*日.{0,15})";

    // Building blocks shared by all filter patterns.
    const string date = @"(?<year>\d{2,4})年(?<month>\d{1,2})月(?<day>\d{1,2})日";
    // Optional weekday wrapped in half-width or full-width parentheses.
    // NOTE(review): the alternation lists 七 but not 日/天 for Sunday — confirm this is intended.
    const string weekday = @"((\(|（)(星期|周)(一|二|三|四|五|六|七)(\)|）))?";
    const string hour = @"(?<hour>\d{1,2})";
    const string minute = @"(?<minute>\d{1,2})";

    // Order matters: the filter tries the patterns in sequence, so the ones that capture
    // hour and minute come before the hour-only and date-only fallbacks.
    List<string> patternFilter = new List<string>
    {
        date + weekday + "(上午)?" + hour + "(:|点|时)" + minute,
        date + weekday + "下午" + hour + "(:|点|时)" + minute,
        date + weekday + "(上午)?" + hour + "点半",
        date + weekday + "下午" + hour + "点半",
        date + weekday + "(上午)?" + hour + "(点|时)",
        date + weekday + "下午" + hour + "(点|时)",
        date
    };

    PdfAnalyzer pa = new PdfAnalyzer();
    PDFNet.Initialize();
    // `item` (the PDF to open) is not declared in this snippet; it must be in scope here.
    // The document is disposed as soon as the search results have been filtered.
    using (PDFDoc doc = new PDFDoc(item))
    {
        doc.InitSecurityHandler();
        List<PdfString> foundAll = pa.RegexSearchAllPages(doc, patternAll);
        // Fix: pass the pattern list. The original passed the single coarse pattern string,
        // which does not match the List<string> parameter.
        return GetMeetingDateFilter(foundAll, patternFilter);
    }
}
/// <summary>
/// Runs each filter pattern against each coarse search hit (with spaces removed) and
/// returns the text of the first match that <c>IsValid</c> accepts.
/// </summary>
/// <param name="foundAll">Coarse hits returned by the page search.</param>
/// <param name="patternAll">Filter patterns, tried in list order for every hit.</param>
/// <returns>The matched date text, or an empty string when nothing valid is found.</returns>
private string GetMeetingDateFilter(List<PdfString> foundAll, List<string> patternAll)
{
    // NOTE(review): the published snippet initialised this with a string literal that was
    // split across two lines (not valid C#). An empty string is assumed to be the intended
    // "not found" value — confirm against the callers.
    string meetingDate = string.Empty;
    if (foundAll == null || patternAll == null)
        return meetingDate;

    // Build each Regex once instead of once per (hit, pattern) pair.
    List<Regex> filters = new List<Regex>(patternAll.Count);
    foreach (string pattern in patternAll)
        filters.Add(new Regex(pattern));

    foreach (PdfString pdfString in foundAll)
    {
        // Spaces are stripped so the filter patterns do not need \s* between tokens.
        string candidate = pdfString.ToString().Replace(" ", "");
        foreach (Regex filter in filters)
        {
            Match ma = filter.Match(candidate);
            if (!ma.Success)
                continue;

            // NOTE(review): the original returned the untouched placeholder here, so the
            // match was never reported. Returning the matched text is assumed to be the
            // intent — confirm the format expected downstream.
            if (IsValid(ma))
                return ma.Value;
            // An invalid match is ignored; keep trying the remaining patterns and hits.
        }
    }
    return meetingDate;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

// Example call site: fetch the meeting date text extracted from the PDF.
string mettingData=GetMeetingData();

/// <summary>
/// Searches every page of the PDF for text that mentions a meeting ("会议") followed by a
/// Chinese-formatted date, then hands the hits to <c>GetMeetingDateFilter</c> together with
/// a list of date/time patterns ordered from most to least specific.
/// </summary>
/// <returns>Whatever <c>GetMeetingDateFilter</c> returns for the coarse hits.</returns>
public string GetMeetingData()
{
    // Coarse pattern: "会议", 2-15 arbitrary characters, a y年M月d日 date and up to 15 trailing
    // characters (room for a weekday and a time of day).
    // The regex escapes (\s, \d) were lost in the published snippet and are restored here.
    string patternAll = @"(?<NDAandCAMDate>会\s*议\s*.{2,15}\d{2,4}\s*年\s*\d{1,2}\s*月\s*\d{1,2}\s*日.{0,15})";

    // Building blocks shared by all filter patterns.
    const string date = @"(?<year>\d{2,4})年(?<month>\d{1,2})月(?<day>\d{1,2})日";
    // Optional weekday wrapped in half-width or full-width parentheses.
    // NOTE(review): the alternation lists 七 but not 日/天 for Sunday — confirm this is intended.
    const string weekday = @"((\(|（)(星期|周)(一|二|三|四|五|六|七)(\)|）))?";
    const string hour = @"(?<hour>\d{1,2})";
    const string minute = @"(?<minute>\d{1,2})";

    // Order matters: the filter tries the patterns in sequence, so the ones that capture
    // hour and minute come before the hour-only and date-only fallbacks.
    List<string> patternFilter = new List<string>
    {
        date + weekday + "(上午)?" + hour + "(:|点|时)" + minute,
        date + weekday + "下午" + hour + "(:|点|时)" + minute,
        date + weekday + "(上午)?" + hour + "点半",
        date + weekday + "下午" + hour + "点半",
        date + weekday + "(上午)?" + hour + "(点|时)",
        date + weekday + "下午" + hour + "(点|时)",
        date
    };

    PdfAnalyzer pa = new PdfAnalyzer();
    PDFNet.Initialize();
    // `item` (the PDF to open) is not declared in this snippet; it must be in scope here.
    // The document is disposed as soon as the search results have been filtered.
    using (PDFDoc doc = new PDFDoc(item))
    {
        doc.InitSecurityHandler();
        List<PdfString> foundAll = pa.RegexSearchAllPages(doc, patternAll);
        // Fix: pass the pattern list. The original passed the single coarse pattern string,
        // which does not match the List<string> parameter.
        return GetMeetingDateFilter(foundAll, patternFilter);
    }
}
/// <summary>
/// Runs each filter pattern against each coarse search hit (with spaces removed) and
/// returns the text of the first match that <c>IsValid</c> accepts.
/// </summary>
/// <param name="foundAll">Coarse hits returned by the page search.</param>
/// <param name="patternAll">Filter patterns, tried in list order for every hit.</param>
/// <returns>The matched date text, or an empty string when nothing valid is found.</returns>
private string GetMeetingDateFilter(List<PdfString> foundAll, List<string> patternAll)
{
    // NOTE(review): the published snippet initialised this with a string literal that was
    // split across two lines (not valid C#). An empty string is assumed to be the intended
    // "not found" value — confirm against the callers.
    string meetingDate = string.Empty;
    if (foundAll == null || patternAll == null)
        return meetingDate;

    // Build each Regex once instead of once per (hit, pattern) pair.
    List<Regex> filters = new List<Regex>(patternAll.Count);
    foreach (string pattern in patternAll)
        filters.Add(new Regex(pattern));

    foreach (PdfString pdfString in foundAll)
    {
        // Spaces are stripped so the filter patterns do not need \s* between tokens.
        string candidate = pdfString.ToString().Replace(" ", "");
        foreach (Regex filter in filters)
        {
            Match ma = filter.Match(candidate);
            if (!ma.Success)
                continue;

            // NOTE(review): the original returned the untouched placeholder here, so the
            // match was never reported. Returning the matched text is assumed to be the
            // intent — confirm the format expected downstream.
            if (IsValid(ma))
                return ma.Value;
            // An invalid match is ignored; keep trying the remaining patterns and hits.
        }
    }
    return meetingDate;
}

注解：

a.第一次通过 pa.RegexSearchAllPages(doc, patternAll); 搜索所有与时间相关的数据信息

b.第二次通过正则匹配获取带有关键词信息Meeting Data

2.PDF类似表格形式关键值数据抓取。（已完成）

简介：这种格式需要用到封装好的数据结构 PdfString 类和 PdfAnalyzer 类，根据给定的关键词在指定范围内提取数据，例如提取下面的数据。

2.1

复制代码

/// <summary>
/// Extracts a percentage value from a table-like PDF by position: it locates the first
/// match of <paramref name="ricCode"/> (used for its x1 coordinate) and the first match of
/// "Premium"/"premium" (used for its y1 coordinate), then returns the first "n.n%" match
/// whose x1 and y1 are both within 10 units of that anchor.
/// </summary>
/// <param name="path">Path of the PDF file to open.</param>
/// <param name="ricCode">Pattern used to find the instrument code in the document.</param>
/// <returns>
/// The matched value with the '%' sign removed, or an empty string when an anchor or the
/// value is missing or an exception occurs (all failures are logged, none are thrown).
/// </returns>
private string GetPremium(string path, string ricCode)
{
    // Largest allowed |delta| on each axis between the anchor and a candidate value.
    const int tolerance = 10;

    string result = string.Empty;
    PDFDoc doc = null;
    try
    {
        PDFNet.Initialize();
        // The original null check after `new` was unreachable (a constructor either returns
        // an instance or throws), so load failures are handled by the catch block below.
        doc = new PDFDoc(path);
        doc.InitSecurityHandler();

        PdfAnalyzer pa = new PdfAnalyzer();
        // NOTE(review): ricCode is passed as a regex unchanged, so characters such as '.'
        // act as metacharacters — confirm whether callers rely on that.
        List<PdfString> listX1 = pa.RegexSearchAllPages(doc, ricCode);
        // "[P|p]" also admitted a literal '|'; "[Pp]" is the intended character class.
        List<PdfString> listY1 = pa.RegexSearchAllPages(doc, @"[Pp]remium");
        // Escapes restored: digits, a literal dot, digits, then '%'.
        List<PdfString> listResult = pa.RegexSearchAllPages(doc, @"(?<Result>\d+\.\d+%)");
        if (listX1.Count == 0 || listY1.Count == 0 || listResult.Count == 0)
        {
            string msg = string.Format("({0}),([P|p]remium) exist missing value ,so Gearing is empty value.", ricCode);
            Logger.Log(msg, Logger.LogType.Warning);
            return result;
        }

        // Anchor point: x from the first code match, y from the first "Premium" match.
        int x1 = System.Convert.ToInt32(listX1[0].Position.x1);
        int y1 = System.Convert.ToInt32(listY1[0].Position.y1);

        foreach (var item in listResult)
        {
            int subX1 = Math.Abs(x1 - System.Convert.ToInt32(item.Position.x1));
            int subY1 = Math.Abs(y1 - System.Convert.ToInt32(item.Position.y1));
            if (subX1 <= tolerance && subY1 <= tolerance)
            {
                result = item.ToString().Replace("%", "");
                return result;
            }
        }

        Logger.Log(string.Format("stock code:{0},extract premium failed .", ricCode), Logger.LogType.Error);
        return result;
    }
    catch (Exception ex)
    {
        // ricCode is now a format argument: concatenating it into the format string could
        // throw FormatException if it contained '{' or '}'. The garbled "rn" is restored
        // to a real line break.
        string msg = string.Format("PDF analysis failed for {0}! Action: Need manually input gearing and premium \r\n error msg:{1}", ricCode, ex.Message);
        Logger.Log(msg, Logger.LogType.Warning);
        return result;
    }
    finally
    {
        // Release the native document handle on every exit path.
        if (doc != null)
            doc.Dispose();
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
/// <summary>
/// Extracts a percentage value from a table-like PDF by position: it locates the first
/// match of <paramref name="ricCode"/> (used for its x1 coordinate) and the first match of
/// "Premium"/"premium" (used for its y1 coordinate), then returns the first "n.n%" match
/// whose x1 and y1 are both within 10 units of that anchor.
/// </summary>
/// <param name="path">Path of the PDF file to open.</param>
/// <param name="ricCode">Pattern used to find the instrument code in the document.</param>
/// <returns>
/// The matched value with the '%' sign removed, or an empty string when an anchor or the
/// value is missing or an exception occurs (all failures are logged, none are thrown).
/// </returns>
private string GetPremium(string path, string ricCode)
{
    // Largest allowed |delta| on each axis between the anchor and a candidate value.
    const int tolerance = 10;

    string result = string.Empty;
    PDFDoc doc = null;
    try
    {
        PDFNet.Initialize();
        // The original null check after `new` was unreachable (a constructor either returns
        // an instance or throws), so load failures are handled by the catch block below.
        doc = new PDFDoc(path);
        doc.InitSecurityHandler();

        PdfAnalyzer pa = new PdfAnalyzer();
        // NOTE(review): ricCode is passed as a regex unchanged, so characters such as '.'
        // act as metacharacters — confirm whether callers rely on that.
        List<PdfString> listX1 = pa.RegexSearchAllPages(doc, ricCode);
        // "[P|p]" also admitted a literal '|'; "[Pp]" is the intended character class.
        List<PdfString> listY1 = pa.RegexSearchAllPages(doc, @"[Pp]remium");
        // Escapes restored: digits, a literal dot, digits, then '%'.
        List<PdfString> listResult = pa.RegexSearchAllPages(doc, @"(?<Result>\d+\.\d+%)");
        if (listX1.Count == 0 || listY1.Count == 0 || listResult.Count == 0)
        {
            string msg = string.Format("({0}),([P|p]remium) exist missing value ,so Gearing is empty value.", ricCode);
            Logger.Log(msg, Logger.LogType.Warning);
            return result;
        }

        // Anchor point: x from the first code match, y from the first "Premium" match.
        int x1 = System.Convert.ToInt32(listX1[0].Position.x1);
        int y1 = System.Convert.ToInt32(listY1[0].Position.y1);

        foreach (var item in listResult)
        {
            int subX1 = Math.Abs(x1 - System.Convert.ToInt32(item.Position.x1));
            int subY1 = Math.Abs(y1 - System.Convert.ToInt32(item.Position.y1));
            if (subX1 <= tolerance && subY1 <= tolerance)
            {
                result = item.ToString().Replace("%", "");
                return result;
            }
        }

        Logger.Log(string.Format("stock code:{0},extract premium failed .", ricCode), Logger.LogType.Error);
        return result;
    }
    catch (Exception ex)
    {
        // ricCode is now a format argument: concatenating it into the format string could
        // throw FormatException if it contained '{' or '}'. The garbled "rn" is restored
        // to a real line break.
        string msg = string.Format("PDF analysis failed for {0}! Action: Need manually input gearing and premium \r\n error msg:{1}", ricCode, ex.Message);
        Logger.Log(msg, Logger.LogType.Warning);
        return result;
    }
    finally
    {
        // Release the native document handle on every exit path.
        if (doc != null)
            doc.Dispose();
    }
}

3.需要PDF中大量数据转换到Excel中去（已完成）

简介：基于2的延伸，加入自动模糊匹配行和列的边界范围，并根据位置坐标排序，提取正确的数据信息。如图：

2.2 2.3

复制代码

/// <summary>
/// Opens the PDF at <c>config.FilePath1</c>, locates the "コード" column title on page 3,
/// extracts the 4-digit codes under it together with the numeric values on each code's
/// row, and writes the normalised rows to a dated CSV file in <c>config.OutputFolder</c>.
/// </summary>
private void StartExtractFile()
{
    PDFNet.Initialize();
    // Dispose the document once extraction is done, including on the early returns below.
    using (PDFDoc doc = new PDFDoc(config.FilePath1))
    {
        doc.InitSecurityHandler();

        string patternTitle = @"コード";
        int page = 3;
        PdfString ricPosition = GetRicPosition(doc, patternTitle, page);
        if (ricPosition == null)
            return;

        // Escapes restored from the published snippet: a 4-digit code, and a number with an
        // optional sign and ',' / '.' separators. The unescaped "(-|+)" was not a valid regex.
        string patternRic = @"\d{4}";
        string patternValue = @"(-|\+)?\d+(,|\.|\d)+";
        List<LineFound> bulkFile = GetValue(doc, ricPosition, patternRic, patternValue);

        // Index of the row whose column count is taken as the expected width.
        int indexOK = 0;
        List<List<string>> bulkFileFilter = FilterBulkFile(bulkFile, indexOK);
        if (bulkFileFilter == null)
        {
            // FilterBulkFile has already logged the reason. Stop here so an existing
            // output file is not deleted and then replaced with nothing.
            return;
        }

        string filePath = Path.Combine(config.OutputFolder, string.Format("Type1ExtractedFromPdf{0}.csv", DateTime.Now.ToString("dd-MM-yyyy")));
        if (File.Exists(filePath))
            File.Delete(filePath);
        XlsOrCsvUtil.GenerateStringCsv(filePath, bulkFileFilter);
        AddResult(Path.GetFileNameWithoutExtension(filePath), filePath, "type1");
    }
}
/// <summary>
/// Converts extracted rows into rows of strings that all have the same width. The width is
/// the number of values in the row at <paramref name="indexOK"/>. Rows of that width are
/// copied as-is; rows of any other width keep only their first value and are padded with
/// empty strings. Rows without values are dropped.
/// </summary>
/// <param name="bulkFile">Rows produced by the extraction step.</param>
/// <param name="indexOK">Index of the row whose value count defines the expected width.</param>
/// <returns>The normalised rows, or <c>null</c> (after logging) when there is no usable data.</returns>
private List<List<string>> FilterBulkFile(List<LineFound> bulkFile, int indexOK)
{
    if (bulkFile == null || bulkFile.Count == 0)
    {
        Logger.Log("no value data extract from pdf");
        return null;
    }

    // The reference row gets the same null check as every other row. The original
    // dereferenced it unconditionally and could throw NullReferenceException.
    List<PdfString> reference = bulkFile[indexOK].LineData;
    if (reference == null)
    {
        Logger.Log(string.Format("reference row {0} has no value data", indexOK));
        return null;
    }
    int count = reference.Count;

    List<List<string>> result = new List<List<string>>();
    foreach (var item in bulkFile)
    {
        if (item.LineData == null || item.LineData.Count <= 0)
            continue;

        List<string> line = new List<string>(count);
        if (item.LineData.Count == count)
        {
            foreach (var value in item.LineData)
                line.Add(value.Words.ToString());
        }
        else
        {
            // Width mismatch: keep the first value only and blank out the other columns.
            line.Add(item.LineData[0].Words.ToString());
            for (int i = 1; i < count; i++)
                line.Add(string.Empty);
        }
        result.Add(line);
    }
    return result;
}
/// <summary>
/// For each scanned page, finds the matches of <paramref name="patternRic"/> positioned
/// relative to <paramref name="ricPosition"/> and, for each of them, collects the matches
/// of <paramref name="patternValue"/> positioned relative to that match.
/// </summary>
/// <param name="doc">Opened PDF document.</param>
/// <param name="ricPosition">Located column title; its rectangle anchors the code search.</param>
/// <param name="patternRic">Regex for the code column.</param>
/// <param name="patternValue">Regex for the values belonging to a code.</param>
/// <returns>
/// The rows collected so far. An exception is logged and ends the scan, so the list may
/// be partial or empty, but it is never null.
/// </returns>
private List<LineFound> GetValue(PDFDoc doc, PdfString ricPosition, string patternRic, string patternValue)
{
    List<LineFound> bulkFile = new List<LineFound>();
    try
    {
        int pageCount = doc.GetPageCount();
        // NOTE(review): the loop starts at 1 and stops before pageCount, so the page numbered
        // pageCount is never scanned. Bound kept as in the original — confirm whether
        // skipping it is intentional.
        for (int i = 1; i < pageCount; i++)
        {
            List<PdfString> ric = pa.RegexExtractByPositionWithPage(doc, patternRic, i, ricPosition.Position);
            if (ric == null)
                continue; // nothing on this page; do not let it abort the remaining pages

            foreach (var item in ric)
            {
                LineFound lineFound = new LineFound();
                lineFound.Ric = item.Words.ToString();
                lineFound.Position = item.Position;
                lineFound.PageNumber = i;
                lineFound.LineData = pa.RegexExtractByPositionWithPage(doc, patternValue, i, item.Position, PositionRect.X2);
                bulkFile.Add(lineFound);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: log and fall through so the caller still gets the rows found so far.
        // The line breaks in the format string were garbled in the published snippet.
        string msg = string.Format("\r\nClassName:  {0}\r\nMethodName: {1}\r\nMessage:    {2}",
            System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(),
            System.Reflection.MethodBase.GetCurrentMethod().Name,
            ex.Message);
        Logger.Log(msg, Logger.LogType.Error);
    }
    return bulkFile;
}
/// <summary>
/// Finds the first occurrence of <paramref name="pattern"/> on the given page.
/// </summary>
/// <param name="doc">Opened PDF document.</param>
/// <param name="pattern">Regex for the column title to locate.</param>
/// <param name="page">Page number to search.</param>
/// <returns>The first match, or <c>null</c> (after logging) when the page has none.</returns>
/// <remarks>Exceptions raised by the search are logged and rethrown.</remarks>
private PdfString GetRicPosition(PDFDoc doc, string pattern, int page)
{
    try
    {
        // Fix: honour the caller's pattern. The original ignored the parameter and searched
        // for a hard-coded literal.
        List<PdfString> ricPosition = pa.RegexSearchByPage(doc, pattern, page);
        if (ricPosition == null || ricPosition.Count == 0)
        {
            // Fix: the original supplied no arguments for {0}..{2}, so string.Format threw
            // FormatException instead of producing this message.
            Logger.Log(string.Format("there is no ric title found by using pattern:{0} to find the ric title ,in the page:{1} of the pdf:{2}", pattern, page, doc.GetFileName()));
            return null;
        }
        return ricPosition[0];
    }
    catch (Exception ex)
    {
        // The line breaks in the format string were garbled in the published snippet.
        string msg = string.Format("\r\nClassName:  {0}\r\nMethodName: {1}\r\nMessage:    {2}",
            System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(),
            System.Reflection.MethodBase.GetCurrentMethod().Name,
            ex.Message);
        Logger.Log(msg, Logger.LogType.Error);
        throw;
    }
}
}
/// <summary>
/// One extracted row: the text matched in the code column plus the values collected for it.
/// </summary>
struct LineFound
{
/// <summary>Text of the code-column match (GetValue stores <c>item.Words.ToString()</c> here).</summary>
public string Ric { get; set; }
/// <summary>Rectangle of the code-column match on its page.</summary>
public Rect Position { get; set; }
/// <summary>Page index at which the match was found (the loop variable in GetValue).</summary>
public int PageNumber { get; set; }
/// <summary>Value matches extracted relative to <see cref="Position"/>; null until assigned.</summary>
public List<PdfString> LineData { get; set; }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
/// <summary>
/// Opens the PDF at <c>config.FilePath1</c>, locates the "コード" column title on page 3,
/// extracts the 4-digit codes under it together with the numeric values on each code's
/// row, and writes the normalised rows to a dated CSV file in <c>config.OutputFolder</c>.
/// </summary>
private void StartExtractFile()
{
    PDFNet.Initialize();
    // Dispose the document once extraction is done, including on the early returns below.
    using (PDFDoc doc = new PDFDoc(config.FilePath1))
    {
        doc.InitSecurityHandler();

        string patternTitle = @"コード";
        int page = 3;
        PdfString ricPosition = GetRicPosition(doc, patternTitle, page);
        if (ricPosition == null)
            return;

        // Escapes restored from the published snippet: a 4-digit code, and a number with an
        // optional sign and ',' / '.' separators. The unescaped "(-|+)" was not a valid regex.
        string patternRic = @"\d{4}";
        string patternValue = @"(-|\+)?\d+(,|\.|\d)+";
        List<LineFound> bulkFile = GetValue(doc, ricPosition, patternRic, patternValue);

        // Index of the row whose column count is taken as the expected width.
        int indexOK = 0;
        List<List<string>> bulkFileFilter = FilterBulkFile(bulkFile, indexOK);
        if (bulkFileFilter == null)
        {
            // FilterBulkFile has already logged the reason. Stop here so an existing
            // output file is not deleted and then replaced with nothing.
            return;
        }

        string filePath = Path.Combine(config.OutputFolder, string.Format("Type1ExtractedFromPdf{0}.csv", DateTime.Now.ToString("dd-MM-yyyy")));
        if (File.Exists(filePath))
            File.Delete(filePath);
        XlsOrCsvUtil.GenerateStringCsv(filePath, bulkFileFilter);
        AddResult(Path.GetFileNameWithoutExtension(filePath), filePath, "type1");
    }
}
/// <summary>
/// Converts extracted rows into rows of strings that all have the same width. The width is
/// the number of values in the row at <paramref name="indexOK"/>. Rows of that width are
/// copied as-is; rows of any other width keep only their first value and are padded with
/// empty strings. Rows without values are dropped.
/// </summary>
/// <param name="bulkFile">Rows produced by the extraction step.</param>
/// <param name="indexOK">Index of the row whose value count defines the expected width.</param>
/// <returns>The normalised rows, or <c>null</c> (after logging) when there is no usable data.</returns>
private List<List<string>> FilterBulkFile(List<LineFound> bulkFile, int indexOK)
{
    if (bulkFile == null || bulkFile.Count == 0)
    {
        Logger.Log("no value data extract from pdf");
        return null;
    }

    // The reference row gets the same null check as every other row. The original
    // dereferenced it unconditionally and could throw NullReferenceException.
    List<PdfString> reference = bulkFile[indexOK].LineData;
    if (reference == null)
    {
        Logger.Log(string.Format("reference row {0} has no value data", indexOK));
        return null;
    }
    int count = reference.Count;

    List<List<string>> result = new List<List<string>>();
    foreach (var item in bulkFile)
    {
        if (item.LineData == null || item.LineData.Count <= 0)
            continue;

        List<string> line = new List<string>(count);
        if (item.LineData.Count == count)
        {
            foreach (var value in item.LineData)
                line.Add(value.Words.ToString());
        }
        else
        {
            // Width mismatch: keep the first value only and blank out the other columns.
            line.Add(item.LineData[0].Words.ToString());
            for (int i = 1; i < count; i++)
                line.Add(string.Empty);
        }
        result.Add(line);
    }
    return result;
}
/// <summary>
/// For each scanned page, finds the matches of <paramref name="patternRic"/> positioned
/// relative to <paramref name="ricPosition"/> and, for each of them, collects the matches
/// of <paramref name="patternValue"/> positioned relative to that match.
/// </summary>
/// <param name="doc">Opened PDF document.</param>
/// <param name="ricPosition">Located column title; its rectangle anchors the code search.</param>
/// <param name="patternRic">Regex for the code column.</param>
/// <param name="patternValue">Regex for the values belonging to a code.</param>
/// <returns>
/// The rows collected so far. An exception is logged and ends the scan, so the list may
/// be partial or empty, but it is never null.
/// </returns>
private List<LineFound> GetValue(PDFDoc doc, PdfString ricPosition, string patternRic, string patternValue)
{
    List<LineFound> bulkFile = new List<LineFound>();
    try
    {
        int pageCount = doc.GetPageCount();
        // NOTE(review): the loop starts at 1 and stops before pageCount, so the page numbered
        // pageCount is never scanned. Bound kept as in the original — confirm whether
        // skipping it is intentional.
        for (int i = 1; i < pageCount; i++)
        {
            List<PdfString> ric = pa.RegexExtractByPositionWithPage(doc, patternRic, i, ricPosition.Position);
            if (ric == null)
                continue; // nothing on this page; do not let it abort the remaining pages

            foreach (var item in ric)
            {
                LineFound lineFound = new LineFound();
                lineFound.Ric = item.Words.ToString();
                lineFound.Position = item.Position;
                lineFound.PageNumber = i;
                lineFound.LineData = pa.RegexExtractByPositionWithPage(doc, patternValue, i, item.Position, PositionRect.X2);
                bulkFile.Add(lineFound);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: log and fall through so the caller still gets the rows found so far.
        // The line breaks in the format string were garbled in the published snippet.
        string msg = string.Format("\r\nClassName:  {0}\r\nMethodName: {1}\r\nMessage:    {2}",
            System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(),
            System.Reflection.MethodBase.GetCurrentMethod().Name,
            ex.Message);
        Logger.Log(msg, Logger.LogType.Error);
    }
    return bulkFile;
}
/// <summary>
/// Finds the first occurrence of <paramref name="pattern"/> on the given page.
/// </summary>
/// <param name="doc">Opened PDF document.</param>
/// <param name="pattern">Regex for the column title to locate.</param>
/// <param name="page">Page number to search.</param>
/// <returns>The first match, or <c>null</c> (after logging) when the page has none.</returns>
/// <remarks>Exceptions raised by the search are logged and rethrown.</remarks>
private PdfString GetRicPosition(PDFDoc doc, string pattern, int page)
{
    try
    {
        // Fix: honour the caller's pattern. The original ignored the parameter and searched
        // for a hard-coded literal.
        List<PdfString> ricPosition = pa.RegexSearchByPage(doc, pattern, page);
        if (ricPosition == null || ricPosition.Count == 0)
        {
            // Fix: the original supplied no arguments for {0}..{2}, so string.Format threw
            // FormatException instead of producing this message.
            Logger.Log(string.Format("there is no ric title found by using pattern:{0} to find the ric title ,in the page:{1} of the pdf:{2}", pattern, page, doc.GetFileName()));
            return null;
        }
        return ricPosition[0];
    }
    catch (Exception ex)
    {
        // The line breaks in the format string were garbled in the published snippet.
        string msg = string.Format("\r\nClassName:  {0}\r\nMethodName: {1}\r\nMessage:    {2}",
            System.Reflection.MethodBase.GetCurrentMethod().DeclaringType.ToString(),
            System.Reflection.MethodBase.GetCurrentMethod().Name,
            ex.Message);
        Logger.Log(msg, Logger.LogType.Error);
        throw;
    }
}
}
/// <summary>
/// One extracted row: the text matched in the code column plus the values collected for it.
/// </summary>
struct LineFound
{
/// <summary>Text of the code-column match (GetValue stores <c>item.Words.ToString()</c> here).</summary>
public string Ric { get; set; }
/// <summary>Rectangle of the code-column match on its page.</summary>
public Rect Position { get; set; }
/// <summary>Page index at which the match was found (the loop variable in GetValue).</summary>
public int PageNumber { get; set; }
/// <summary>Value matches extracted relative to <see cref="Position"/>; null until assigned.</summary>
public List<PdfString> LineData { get; set; }
}