Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
Marta Balode
democrasci_preprocWP1
Commits
4ea20064
Commit
4ea20064
authored
Apr 27, 2020
by
Luis Salamanca
Browse files
More-or-less usable code for title completion
parent
c32c808e
Changes
5
Expand all
Hide whitespace changes
Inline
Side-by-side
src/python/def_classes.py
View file @
4ea20064
...
...
@@ -304,15 +304,25 @@ class Document:
img_aux
[
img_aux
<
20
]
=
0
img_aux
[
img_aux
>=
20
]
=
255
sum_img
=
np
.
sum
(
img_aux
,
axis
=
0
)
mean_sum_img
=
np
.
mean
(
sum_img
)
*
1.5
num_pix_comp
=
int
(
len
(
sum_img
)
/
20
)
if
(
np
.
mean
(
sum_img
[:
num_pix_comp
]
>
mean_sum_img
)
or
np
.
mean
(
sum_img
[
-
num_pix_comp
:]
>
mean_sum_img
)):
#mean_sum_img = np.mean(sum_img) * 1.5
firt_25
=
int
(
2
*
len
(
sum_img
)
/
5
)
second_25
=
int
(
3
*
len
(
sum_img
)
/
5
)
mean_left
=
1.5
*
np
.
mean
(
sum_img
[:
firt_25
])
mean_right
=
1.5
*
np
.
mean
(
sum_img
[
second_25
:])
num_pix_comp
=
int
(
len
(
sum_img
)
/
20
)
# The larger the denominator, the more likely we remove black margins
if
(
np
.
sum
(
sum_img
[:
firt_25
]
>
mean_left
)
>
num_pix_comp
)
or
(
np
.
sum
(
sum_img
[
second_25
:]
>
mean_right
)
>
num_pix_comp
):
print
(
'Removing black margins caused by scanning'
)
flag_remove
=
True
if
flag_remove
:
coord_ad
,
_
=
plot_tools
.
adapt_coordtoimg
(
imarray
,
coord_textline
,
bbox_page
)
min_left_coord
,
max_right_coord
,
ind_valid_coord
=
preproc_docs
.
remove_black_sides
(
imarray
,
coord_ad
)
try
:
min_left_coord
,
max_right_coord
,
ind_valid_coord
=
preproc_docs
.
remove_black_sides
(
imarray
,
coord_ad
)
except
:
min_left_coord
=
0
max_right_coord
=
imarray
.
shape
[
1
]
ind_valid_coord
=
np
.
arange
(
coord_ad
.
shape
[
1
])
print
(
'Error while removing black margins. Not doing anything!'
)
coord_textline
=
coord_textline
[:,
ind_valid_coord
]
imarray
[:,:
min_left_coord
,:]
=
255
imarray
[:,
max_right_coord
:,:]
=
255
...
...
src/python/preproc_docs.py
View file @
4ea20064
...
...
@@ -337,13 +337,7 @@ def remove_black_sides(img_aux, coord):
valid_side
=
(
ind_left
<
len
(
sum_img
)
/
2
)
else
:
valid_side
=
(
ind_left
>
(
len
(
sum_img
)
-
ind_right
))
if
0
:
plt
.
figure
()
plt
.
plot
(
sum_img
)
plt
.
plot
([
0
,
len
(
sum_img
)],[
max_th
,
max_th
])
plt
.
plot
([
0
,
int
(
len
(
sum_img
)
/
2
)],[
min_th_left
,
min_th_left
])
plt
.
plot
([
int
(
len
(
sum_img
)
/
2
),
len
(
sum_img
)],[
min_th_right
,
min_th_right
])
if
valid_side
:
inc_start
=
np
.
argwhere
(
sum_img
[(
ind_left
):]
>
min_th_left
)
# We need the following to avoid the ramp, as the threshold is different
...
...
@@ -358,9 +352,18 @@ def remove_black_sides(img_aux, coord):
max_right_coord
=
int
((
ind_right
+
inc_start
[
ind_inc_start
])
/
2
)
#print(ind_right,ind_inc_start,max_right_coord)
# One of the sides is outside the valid [min_left_coord, max_right_coord] range, and the textline is narrow
if
0
:
plt
.
figure
()
plt
.
plot
(
sum_img
)
plt
.
plot
([
0
,
len
(
sum_img
)],[
max_th
,
max_th
])
plt
.
plot
([
0
,
int
(
len
(
sum_img
)
/
2
)],[
min_th_left
,
min_th_left
])
plt
.
plot
([
int
(
len
(
sum_img
)
/
2
),
len
(
sum_img
)],[
min_th_right
,
min_th_right
])
plt
.
plot
([
min_left_coord
,
min_left_coord
],[
0
,
np
.
max
(
sum_img
)])
plt
.
plot
([
max_right_coord
,
max_right_coord
],[
0
,
np
.
max
(
sum_img
)])
ind_notvalid_coord
=
np
.
ravel
((
np
.
argwhere
(((
coord
[
1
,:]
<
min_left_coord
)
|
(
coord
[
3
,:]
>
max_right_coord
))
&
((
coord
[
3
,:]
-
coord
[
1
,:])
<
len
(
sum_img
)
/
10
))))
print
(
ind_notvalid_coord
)
#print(ind_notvalid_coord)
ind_valid_coord
=
np
.
setdiff1d
(
np
.
arange
(
coord
.
shape
[
1
]),
ind_notvalid_coord
)
#img_aux_clean = copy.copy(img_aux)
#img_aux_clean[:,0:min_left_coord] = 0
...
...
src/python/run_titles_completion.py
0 → 100644
View file @
4ea20064
This diff is collapsed.
Click to expand it.
src/python/test_correct.py
View file @
4ea20064
...
...
@@ -3,13 +3,17 @@ os.environ['DEMOCRASCI_DATA'] = "/Users/luissalamanca/My_stuff/05_SDSCresearch/0
import
def_classes
as
defc
import
utils_proc
year
=
1982
year
=
19
11
year
=
19
22
folder_database
=
'../../data/AB_other/SessionOverviews_tar/'
folder_database
=
'/Users/luissalamanca/Dropbox/My_stuff/05_SDSCresearch/02_NLP/Data/AB_other/SessionOverviews_tar'
#%%
iddoc
=
'110001467'
iddoc
=
'110000
271
'
iddoc
=
'110000
455
'
input_file
=
"./{}/{}.pdf"
.
format
(
year
,
iddoc
)
doc
=
defc
.
Document
(
input_file
,
folder_database
,
flag_type
=
3
)
...
...
@@ -20,3 +24,21 @@ doc = defc.Document(input_file, folder_database, flag_type = 3)
doc
.
correct_xml
(
flag_plots
=
1
,
flag_parallel
=
0
,
flag_save_figs
=
1
,
pages
=
'all'
,
suffix_xml
=
'_data'
,
name_outxml
=
'02_extractedxml'
,
name_outcorrxml
=
'04_correctedxml'
,
flag_save
=
1
)
#%%
name_tar
=
'00_rawpdfs'
list_docs
=
utils_proc
.
get_list
(
year
,
folder_database
,
name_tar
)[
1
]
for
l_d
in
list_docs
:
input_file
=
"./{}/{}.pdf"
.
format
(
year
,
l_d
)
doc
=
defc
.
Document
(
input_file
,
folder_database
,
flag_type
=
3
)
doc
.
pdf2xml
(
pages
=
'all'
,
suffix_xml
=
'_data'
,
flag_save
=
1
,
name_outxml
=
'02_extractedxml'
)
# IT IS PICKING UP ARTIFACTS THAT MAKE THE CENTRAL LINE LOOK HORRIBLE
# TODO: IMPROVE THE CENTRAL LINE CLEANING
doc
.
correct_xml
(
flag_plots
=
1
,
flag_parallel
=
0
,
flag_save_figs
=
1
,
pages
=
'all'
,
suffix_xml
=
'_data'
,
name_outxml
=
'02_extractedxml'
,
name_outcorrxml
=
'04_correctedxml'
,
flag_save
=
1
)
src/python/test_extendtitles.py
View file @
4ea20064
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment