Manual de usuario de calibre, Publicación 2.73.0
# resulting in a lot of extra things to be removed by remove_tags.
blq_content_reg_exp = r'^.*blq[_ -]*content.*$'
# The BBC has an alternative page design structure, which I suspect is an
# out-of-date design but which is still used in some articles, e.g. 'Click'
# (technology), 'FastTrack' (travel), and in some sport pages. These
# alternative pages are table based (which is why I think they are an
# out-of-date design) and account for - I'm guesstimating - less than 1% of
# all articles. They use a table class 'storycontent' to hold the article
# and, like blq_content (above), have required lots of extra removal by
# remove_tags.
story_content_reg_exp = r'^.*story[_ -]*content.*$'
# Keep the sections of the HTML which match the list below. The HTML page
# created by calibre will be filled with the sections that are matched.
# Note that blq_content_reg_exp must be listed before storybody_reg_exp in
# keep_only_tags because it is the parent of storybody_reg_exp; that is to
# say, the div class/id 'story-body' will be inside div class/id
# 'blq_content' in the HTML (if 'blq_content' is there at all). If they are
# the other way around in keep_only_tags then blq_content_reg_exp will end
# up being discarded.
keep_only_tags = [
    # Table-based (alternative design) article container.
    dict(name='table',
         attrs={'class': re.compile(story_content_reg_exp, re.IGNORECASE)}),
    # blq_content container, matched by class or by id; must come before
    # the story-body entries (see the note above).
    dict(name='div',
         attrs={'class': re.compile(blq_content_reg_exp, re.IGNORECASE)}),
    dict(name='div',
         attrs={'id': re.compile(blq_content_reg_exp, re.IGNORECASE)}),
    # story-body container, matched by class or by id.
    dict(name='div',
         attrs={'class': re.compile(storybody_reg_exp, re.IGNORECASE)}),
    dict(name='div',
         attrs={'id': re.compile(storybody_reg_exp, re.IGNORECASE)}),
]
# ************************************
# Regular expressions for remove_tags:
# ************************************
# Regular expression to remove share-help and variant tags. The share-help
# class is used by the site for a variety of 'sharing' type links, e.g.
# Facebook, delicious, twitter, email. Removed to avoid page clutter.
share_help_reg_exp = r'^.*share[_ -]*help.*$'
# Regular expression to remove embedded-hyper and variant tags. This class
# is used to display links to other BBC News articles on the same/similar
# subject.
# NOTE(review): 'embed*ed' allows any number of 'd's (including none) before
# the final 'ed', so both the 'embeded' and 'embedded' spellings match.
embedded_hyper_reg_exp = r'^.*embed*ed[_ -]*hyper.*$'
# Regular expression to remove hypertabs and variant tags. This class is used to
# display a tab bar at the top of an article which allows the user to switch to
# an article (viewed on the same page) providing further info., 'in depth' analysis,
36
Capítulo 1. Secciones