Help Extracting PDF with many small tables #1170

---
Basically you want to chop up the page into each individual table area (with `page.crop`). If you look at each "Bid Ask" header row, one possible definition of the table area could be: from the second vertical line to the header's left, across to the next vertical line on its right, and from the horizontal line just above it down to the next "Bid Ask" header (or the page bottom).

You can search for each "Bid Ask" header area with `page.search`, and then find the nearest top/left/right lines to use as the crop borders:

```python
import pdfplumber
from bisect import bisect_left, bisect_right
from operator import itemgetter
from pdfplumber.utils import cluster_objects

pdf = pdfplumber.open("Downloads/Karbone.Prices.July.8.2022.pdf")
page = pdf.pages[0]

# x-coordinate of each distinct vertical line on the page, left to right
vertical_lines = [
    min(col, key=itemgetter("x0"))["x0"]
    for col in cluster_objects(page.vertical_edges, itemgetter("x0"), tolerance=3)
]

# y-coordinate of each distinct horizontal line on the page, top to bottom
horizontal_lines = [
    min(col, key=itemgetter("top"))["top"]
    for col in cluster_objects(page.horizontal_edges, itemgetter("top"), tolerance=3)
]

# Group the "Bid Ask" headers into columns of tables
cols = cluster_objects(
    page.search("Bid Ask"), itemgetter("x0"), tolerance=3
)

for col_num, col in enumerate(cols):
    left = vertical_lines[
        bisect_left(vertical_lines, col[0]["x0"]) - 2  # second line to left
    ]
    right = vertical_lines[
        bisect_right(vertical_lines, col[0]["x0"]) + 1  # next line to right
    ]
    for row_num, row in enumerate(col):
        top = horizontal_lines[
            bisect_left(horizontal_lines, row["top"]) - 1  # line above
        ]
        try:
            bottom = col[row_num + 1]["top"]
        except IndexError:
            bottom = page.bbox[-1]  # if no "Bid Ask" below us, use page bottom
        crop = page.crop((left, top, right, bottom))
        crop.to_image(200, antialias=True).save(f"crop-{col_num}-{row_num}.png")
```

Some example results:

It's not perfect; one result from the final column doesn't catch the top header cleanly. Perhaps a more robust approach is to use the position of the text on the line directly above each "Bid Ask" section instead of the horizontal lines. Some extra work is also needed if you need to group the tables by subsection markers, e.g. PJM, NEPOOL; a rough sketch of that is below.
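For that grouping, one possibility is to search for the section labels and attach each crop to the nearest label above it in the same column. A minimal sketch, assuming the labels are literal strings like "PJM" and "NEPOOL" (hypothetical; adjust to whatever markers the PDF actually uses):

```python
# Sketch only: "PJM" and "NEPOOL" are assumed literal section labels
markers = sorted(
    (m for name in ("PJM", "NEPOOL") for m in page.search(name)),
    key=itemgetter("top"),
)

def section_for(left, right, top):
    # Nearest marker above the crop that overlaps it horizontally
    above = [
        m for m in markers
        if m["top"] < top and m["x0"] < right and m["x1"] > left
    ]
    return above[-1]["text"] if above else None
```

Each crop's `left`/`right`/`top` from the loop above can then be passed to `section_for` to label the table.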

---
I think this could perhaps be a more "accurate" approach than #1170 (comment). Instead of using horizontal lines, it first finds the characters that are "in line" with each "Bid" column. It then finds the nearest word *above* the "Bid", i.e. the table name:

```python
import pdfplumber
from bisect import bisect_left, bisect_right
from operator import itemgetter
from pdfplumber.utils import cluster_objects
from pdfplumber.utils.text import WordExtractor

pdf = pdfplumber.open("Downloads/Karbone.Prices.July.8.2022.pdf")
page = pdf.pages[0]

bid_ask = page.search(r"Bid\s+Ask")

# Group tables into "columns", which makes it easier to locate nearest objects
columns = cluster_objects(bid_ask, itemgetter("x0"), tolerance=3)
tables = [[] for _ in columns]

# Map each char's matrix coords back to the word that contains it
page_words = (
    WordExtractor(keep_blank_chars=True, use_text_flow=True)
    .iter_extract_tuples(page.chars)
)
page_chars = {}
for word, word_chars in page_words:
    for char in word_chars:
        page_chars[char["matrix"]] = dict(word=word, chars=word_chars)

vertical_lines = [
    min(col, key=itemgetter("x0"))["x0"]
    for col in cluster_objects(page.vertical_edges, itemgetter("x0"), tolerance=3)
]

for col_num, col in enumerate(columns):
    left = vertical_lines[
        bisect_left(vertical_lines, col[0]["x0"]) - 2  # second line to left
    ]
    right = vertical_lines[
        bisect_right(vertical_lines, col[0]["x0"]) + 1  # next line to right
    ]
    col = sorted(col, key=itemgetter("top"))
    for row_num, bid in enumerate(col):
        # The cluster of chars vertically "in line" with this "Bid"
        cluster = next(
            cluster for cluster in
            cluster_objects(page.chars + [bid], itemgetter("x0"), tolerance=1)
            if bid in cluster
        )
        words_in_cluster = {}
        for obj in cluster:
            if obj is not bid and obj["matrix"] in page_chars:
                # A word can contain multiple matching chars, so key on the
                # matrix coords of the word's first char to de-duplicate
                word = page_chars[obj["matrix"]]["word"]
                matrix = page_chars[obj["matrix"]]["chars"][0]["matrix"]
                words_in_cluster[matrix] = word
        # Sort by vertical position so bisect searches sorted data
        words_in_cluster = sorted(words_in_cluster.values(), key=itemgetter("bottom"))
        # Nearest word ending above the "Bid" (bisect's key= needs Python 3.10+)
        header = words_in_cluster[
            bisect_left(words_in_cluster, bid["top"], key=itemgetter("bottom")) - 1
        ]
        tables[col_num].append(
            dict(bid=bid, header=header, left=left, top=bid["bottom"], right=right)
        )

for col_num, col in enumerate(tables):
    # The bottom of each table is the top of the next table's header
    for idx in range(len(col) - 1):
        col[idx]["bottom"] = col[idx + 1]["header"]["top"]
    col[-1]["bottom"] = page.bbox[-1]  # last table: use page bottom
    for row_num, table in enumerate(col):
        name = table["header"]["text"]
        crop = page.crop((table["left"], table["top"], table["right"], table["bottom"]))
        print(f"{name=}")
        # crop.extract_table()
        # crop.to_image(200, antialias=True).save(f"crop-{col_num}-{row_num}.png")
```
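From there, pulling the data out is just a matter of calling `extract_table` on each crop. A minimal sketch of that last step (the text-based `table_settings` are an assumption; line-based strategies may suit this layout better):

```python
# Sketch: collect each cropped table's rows, keyed by its header text
results = {}
for col in tables:
    for table in col:
        crop = page.crop(
            (table["left"], table["top"], table["right"], table["bottom"])
        )
        # "text" strategies are an assumption, not a tested setting
        results[table["header"]["text"]] = crop.extract_table(
            {"vertical_strategy": "text", "horizontal_strategy": "text"}
        )
```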

---
I am attempting to scrape a PDF containing a bunch of pricing data, and I am having trouble getting pdfplumber to identify each smaller table in a consistent manner. I have tried filtering based on stroke color and using the curves and edges to define explicit lines, which has definitely gotten me closer to the end goal, but the results are still inconsistent.
This is the unmarked pdf:
This is the output of `debug_tablefinder`:
This is my code so far:
```python
import glob
from operator import itemgetter

import pdfplumber

def inside(self, other):
    return all((
        self['x0'] >= other['x0'],
        self['top'] >= other['top'],
        self['x1'] <= other['x1'],
        self['bottom'] <= other['bottom'],
    ))

def largest_parent_rect(page, self):
    parent_rects = [other for other in page.rects if inside(self, other)]
    if parent_rects:
        parent_rect = max(parent_rects, key=itemgetter('width', 'height'))
        if self != parent_rect:
            return parent_rect

def remove_nested_rects(page, keep_largest=False):
    def filter_condition(other):
        if other['object_type'] == 'rect':
            return tuple(other['pts']) not in rects_to_remove
        return True

def keep_visible_lines(obj):
    if obj['object_type'] == 'rect':
        return obj['non_stroking_color'] == [1]
    return True

frames = []
# Raw string so "\U" in the Windows path isn't treated as an escape
for file in glob.glob(r"C:\Users\jfitzpatrick\Desktop\Fix Karbone\*.pdf"):
    print(file)
    start_date_formatted = ""
    end_date_formatted = ""
    insert = False
    tables = []
    with pdfplumber.open(file, repair=False) as pdf:
        ...
```
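Roughly, I am wiring these helpers together like this (a simplified sketch, not my exact settings; the explicit-lines `table_settings` here are approximate):

```python
# Simplified sketch: filter out invisible rects, then feed the remaining
# curves/edges to the table finder as explicit lines
with pdfplumber.open(file, repair=False) as pdf:
    page = pdf.pages[0]
    filtered = page.filter(keep_visible_lines)
    table_settings = {
        "vertical_strategy": "explicit",
        "horizontal_strategy": "explicit",
        "explicit_vertical_lines": filtered.curves + filtered.edges,
        "explicit_horizontal_lines": filtered.curves + filtered.edges,
    }
    found = filtered.extract_tables(table_settings)
```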
Here is the pdf:
Karbone Prices July 8, 2022.pdf
Any help is greatly appreciated. Thank you for such an awesome package; I have been using it for many other projects.