Commit 4ea20064 authored by Luis Salamanca's avatar Luis Salamanca
Browse files

A more or less usable code for title completion

parent c32c808e
......@@ -304,15 +304,25 @@ class Document:
img_aux[img_aux < 20] = 0
img_aux[img_aux >= 20] = 255
sum_img = np.sum(img_aux, axis = 0)
mean_sum_img = np.mean(sum_img) * 1.5
num_pix_comp = int(len(sum_img)/20)
if (np.mean(sum_img[:num_pix_comp] > mean_sum_img) or np.mean(sum_img[-num_pix_comp:] > mean_sum_img)):
#mean_sum_img = np.mean(sum_img) * 1.5
firt_25 = int(2*len(sum_img)/5)
second_25 = int(3*len(sum_img)/5)
mean_left = 1.5 * np.mean(sum_img[:firt_25])
mean_right = 1.5 * np.mean(sum_img[second_25:])
num_pix_comp = int(len(sum_img)/20) # The larger the denominator, the more likely we remove black margins
if (np.sum(sum_img[:firt_25] > mean_left) > num_pix_comp) or (np.sum(sum_img[second_25:] > mean_right) > num_pix_comp):
print('Removing black margins caused by scanning')
flag_remove = True
if flag_remove:
coord_ad, _ = plot_tools.adapt_coordtoimg(imarray, coord_textline, bbox_page)
min_left_coord, max_right_coord, ind_valid_coord = preproc_docs.remove_black_sides(imarray, coord_ad)
try:
min_left_coord, max_right_coord, ind_valid_coord = preproc_docs.remove_black_sides(imarray, coord_ad)
except:
min_left_coord = 0
max_right_coord = imarray.shape[1]
ind_valid_coord = np.arange(coord_ad.shape[1])
print('Error while removing black margins. Not doing anything!')
coord_textline = coord_textline[:,ind_valid_coord]
imarray[:,:min_left_coord,:] = 255
imarray[:,max_right_coord:,:] = 255
......
......@@ -337,13 +337,7 @@ def remove_black_sides(img_aux, coord):
valid_side = (ind_left < len(sum_img)/2)
else:
valid_side = (ind_left > (len(sum_img) - ind_right))
if 0:
plt.figure()
plt.plot(sum_img)
plt.plot([0,len(sum_img)],[max_th,max_th])
plt.plot([0,int(len(sum_img)/2)],[min_th_left,min_th_left])
plt.plot([int(len(sum_img)/2),len(sum_img)],[min_th_right,min_th_right])
if valid_side:
inc_start = np.argwhere(sum_img[(ind_left):] > min_th_left)
# We need the following to avoid the ramp, as the threshold is different
......@@ -358,9 +352,18 @@ def remove_black_sides(img_aux, coord):
max_right_coord = int((ind_right + inc_start[ind_inc_start])/2)
#print(ind_right,ind_inc_start,max_right_coord)
# One of the sides is outside the valid [min_left_coord, max_right_coord] range, and the textline is small
if 0:
plt.figure()
plt.plot(sum_img)
plt.plot([0,len(sum_img)],[max_th,max_th])
plt.plot([0,int(len(sum_img)/2)],[min_th_left,min_th_left])
plt.plot([int(len(sum_img)/2),len(sum_img)],[min_th_right,min_th_right])
plt.plot([min_left_coord,min_left_coord],[0,np.max(sum_img)])
plt.plot([max_right_coord,max_right_coord],[0,np.max(sum_img)])
ind_notvalid_coord = np.ravel((np.argwhere(((coord[1,:] < min_left_coord) | (coord[3,:] > max_right_coord)) &
((coord[3,:] - coord[1,:]) < len(sum_img)/10))))
print(ind_notvalid_coord)
#print(ind_notvalid_coord)
ind_valid_coord = np.setdiff1d(np.arange(coord.shape[1]),ind_notvalid_coord)
#img_aux_clean = copy.copy(img_aux)
#img_aux_clean[:,0:min_left_coord] = 0
......
This diff is collapsed.
......@@ -3,13 +3,17 @@ os.environ['DEMOCRASCI_DATA'] = "/Users/luissalamanca/My_stuff/05_SDSCresearch/0
import def_classes as defc
import utils_proc
year = 1982
year = 1911
year = 1922
folder_database = '../../data/AB_other/SessionOverviews_tar/'
folder_database = '/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar'
#%%
iddoc = '110001467'
iddoc = '110000271'
iddoc = '110000455'
input_file = "./{}/{}.pdf".format(year, iddoc)
doc = defc.Document(input_file, folder_database, flag_type = 3)
......@@ -20,3 +24,21 @@ doc = defc.Document(input_file, folder_database, flag_type = 3)
doc.correct_xml(flag_plots = 1, flag_parallel = 0, flag_save_figs = 1,
pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml',
name_outcorrxml = '04_correctedxml', flag_save = 1)
#%%
name_tar = '00_rawpdfs'
list_docs = utils_proc.get_list(year, folder_database, name_tar)[1]
for l_d in list_docs:
input_file = "./{}/{}.pdf".format(year, l_d)
doc = defc.Document(input_file, folder_database, flag_type = 3)
doc.pdf2xml(pages = 'all', suffix_xml = '_data', flag_save = 1, name_outxml = '02_extractedxml')
# IT IS PICKING UP ARTIFACTS THAT MAKE THE CENTRAL LINE LOOK HORRIBLE
# IMPROVE THE CENTRAL LINE CLEANING
doc.correct_xml(flag_plots = 1, flag_parallel = 0, flag_save_figs = 1,
pages = 'all', suffix_xml = '_data', name_outxml = '02_extractedxml',
name_outcorrxml = '04_correctedxml', flag_save = 1)
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment