Parse metadata from multiple fields

Closes #196
pull/310/head
pukkandan 4 years ago
parent 3700c7ef10
commit 143db31d48
No known key found for this signature in database
GPG Key ID: 0F00D95A001F4698

@ -670,18 +670,24 @@ Then simply run `make`. You can also run `make yt-dlp` instead to compile only t
--add-metadata Write metadata to the video file --add-metadata Write metadata to the video file
--no-add-metadata Do not write metadata (default) --no-add-metadata Do not write metadata (default)
--parse-metadata FIELD:FORMAT Parse additional metadata like title/artist --parse-metadata FIELD:FORMAT Parse additional metadata like title/artist
from other fields. Give field name to from other fields. Give a template or field
extract data from, and format of the field name to extract data from and the format to
separated by a ":". Either regular interpret it as, separated by a ":". Either
expression with named capture groups or a regular expression with named capture
similar syntax to the output template can groups or a similar syntax to the output
also be used. The parsed parameters replace template can be used for the FORMAT.
any existing values and can be use in Similarly, the syntax for output template
output template. This option can be used can be used for FIELD to parse the data
multiple times. Example: --parse-metadata from multiple fields. The parsed parameters
"title:%(artist)s - %(title)s" matches a replace any existing values and can be used
title like "Coldplay - Paradise". Example in output templates. This option can be
(regex): --parse-metadata used multiple times. Example: --parse-
metadata "title:%(artist)s - %(title)s"
matches a title like "Coldplay - Paradise".
Example: --parse-metadata "%(series)s
%(episode_number)s:%(title)s" sets the
title using series and episode number.
Example (regex): --parse-metadata
"description:Artist - (?P<artist>.+?)" "description:Artist - (?P<artist>.+?)"
--xattrs Write metadata to the video file's xattrs --xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards) (using dublin core and xdg standards)

@ -67,6 +67,7 @@ from .utils import (
float_or_none, float_or_none,
format_bytes, format_bytes,
format_field, format_field,
FORMAT_RE,
formatSeconds, formatSeconds,
GeoRestrictedError, GeoRestrictedError,
int_or_none, int_or_none,
@ -772,95 +773,93 @@ class YoutubeDL(object):
'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.')
return outtmpl_dict return outtmpl_dict
def prepare_outtmpl(self, outtmpl, info_dict, sanitize=None):
    """
    Make the template and info_dict suitable for substitution (outtmpl % info_dict)

    outtmpl is a %-style template string with mapping keys. info_dict is
    copied, never modified. sanitize is an optional callable
    (key, value) -> value applied to every non-numeric value; when omitted
    the values are used unchanged.

    Returns a tuple (outtmpl, template_dict). template_dict is a defaultdict
    that yields the 'outtmpl_na_placeholder' param (default 'NA') for any
    missing key; outtmpl is patched so that the substitution cannot fail on
    missing numeric fields.
    """
    template_dict = dict(info_dict)

    # duration_string
    template_dict['duration_string'] = (  # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
        formatSeconds(info_dict['duration'], '-')
        if info_dict.get('duration', None) is not None
        else None)

    # epoch
    template_dict['epoch'] = int(time.time())

    # autonumber
    autonumber_size = self.params.get('autonumber_size')
    if autonumber_size is None:
        autonumber_size = 5
    template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads

    # resolution if not defined
    if template_dict.get('resolution') is None:
        if template_dict.get('width') and template_dict.get('height'):
            template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
        elif template_dict.get('height'):
            template_dict['resolution'] = '%sp' % template_dict['height']
        elif template_dict.get('width'):
            template_dict['resolution'] = '%dx?' % template_dict['width']

    if sanitize is None:
        sanitize = lambda k, v: v
    # Drop missing values and containers. Numbers bypass sanitize so that
    # numeric conversion types in the template keep receiving numbers.
    template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
                         for k, v in template_dict.items()
                         if v is not None and not isinstance(v, (list, tuple, dict)))
    na = self.params.get('outtmpl_na_placeholder', 'NA')
    template_dict = collections.defaultdict(lambda: na, template_dict)

    # For fields playlist_index and autonumber convert all occurrences
    # of %(field)s to %(field)0Nd for backward compatibility
    field_size_compat_map = {
        # .get() instead of [] so that the lookup does not insert a
        # placeholder 'n_entries' entry into the defaultdict
        'playlist_index': len(str(template_dict.get('n_entries', na))),
        'autonumber': autonumber_size,
    }
    FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'

    def pad_field(mobj):
        # Look the width up per match, so a template that uses both fields
        # pads each one with its own size
        field = mobj.group('field')
        return '%%(%s)0%dd' % (field, field_size_compat_map[field])

    outtmpl = re.sub(FIELD_SIZE_COMPAT_RE, pad_field, outtmpl)

    numeric_fields = list(self._NUMERIC_FIELDS)

    # Format date
    FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
    for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
        conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
        if key in template_dict:
            continue
        value = strftime_or_none(template_dict.get(field), frmt, na)
        if conv_type in 'crs':  # string
            value = sanitize(field, value)
        else:  # number
            numeric_fields.append(key)
            value = float_or_none(value, default=None)
        if value is not None:
            template_dict[key] = value

    # Missing numeric fields used together with integer presentation types
    # in format specification will break the argument substitution since
    # string NA placeholder is returned for missing fields. We will patch
    # output template for missing fields to meet string presentation type.
    for numeric_field in numeric_fields:
        if numeric_field not in template_dict:
            # The key may contain user-supplied date-format text; escape
            # backslashes so re.sub inserts the replacement literally
            string_spec = '%({0})s'.format(numeric_field).replace('\\', '\\\\')
            outtmpl = re.sub(
                FORMAT_RE.format(re.escape(numeric_field)),
                string_spec, outtmpl)

    return outtmpl, template_dict
def _prepare_filename(self, info_dict, tmpl_type='default'): def _prepare_filename(self, info_dict, tmpl_type='default'):
try: try:
template_dict = dict(info_dict)
template_dict['duration_string'] = ( # %(duration>%H-%M-%S)s is wrong if duration > 24hrs
formatSeconds(info_dict['duration'], '-')
if info_dict.get('duration', None) is not None
else None)
template_dict['epoch'] = int(time.time())
autonumber_size = self.params.get('autonumber_size')
if autonumber_size is None:
autonumber_size = 5
template_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads
if template_dict.get('resolution') is None:
if template_dict.get('width') and template_dict.get('height'):
template_dict['resolution'] = '%dx%d' % (template_dict['width'], template_dict['height'])
elif template_dict.get('height'):
template_dict['resolution'] = '%sp' % template_dict['height']
elif template_dict.get('width'):
template_dict['resolution'] = '%dx?' % template_dict['width']
sanitize = lambda k, v: sanitize_filename( sanitize = lambda k, v: sanitize_filename(
compat_str(v), compat_str(v),
restricted=self.params.get('restrictfilenames'), restricted=self.params.get('restrictfilenames'),
is_id=(k == 'id' or k.endswith('_id'))) is_id=(k == 'id' or k.endswith('_id')))
template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v))
for k, v in template_dict.items()
if v is not None and not isinstance(v, (list, tuple, dict)))
na = self.params.get('outtmpl_na_placeholder', 'NA')
template_dict = collections.defaultdict(lambda: na, template_dict)
outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default']) outtmpl = self.outtmpl_dict.get(tmpl_type, self.outtmpl_dict['default'])
force_ext = OUTTMPL_TYPES.get(tmpl_type) outtmpl, template_dict = self.prepare_outtmpl(outtmpl, info_dict, sanitize)
# For fields playlist_index and autonumber convert all occurrences
# of %(field)s to %(field)0Nd for backward compatibility
field_size_compat_map = {
'playlist_index': len(str(template_dict['n_entries'])),
'autonumber': autonumber_size,
}
FIELD_SIZE_COMPAT_RE = r'(?<!%)%\((?P<field>autonumber|playlist_index)\)s'
mobj = re.search(FIELD_SIZE_COMPAT_RE, outtmpl)
if mobj:
outtmpl = re.sub(
FIELD_SIZE_COMPAT_RE,
r'%%(\1)0%dd' % field_size_compat_map[mobj.group('field')],
outtmpl)
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
FORMAT_RE = r'''(?x)
(?<!%)
%
\({0}\) # mapping key
(?:[#0\-+ ]+)? # conversion flags (optional)
(?:\d+)? # minimum field width (optional)
(?:\.\d+)? # precision (optional)
[hlL]? # length modifier (optional)
(?P<type>[diouxXeEfFgGcrs%]) # conversion type
'''
numeric_fields = list(self._NUMERIC_FIELDS)
# Format date
FORMAT_DATE_RE = FORMAT_RE.format(r'(?P<key>(?P<field>\w+)>(?P<format>.+?))')
for mobj in re.finditer(FORMAT_DATE_RE, outtmpl):
conv_type, field, frmt, key = mobj.group('type', 'field', 'format', 'key')
if key in template_dict:
continue
value = strftime_or_none(template_dict.get(field), frmt, na)
if conv_type in 'crs': # string
value = sanitize(field, value)
else: # number
numeric_fields.append(key)
value = float_or_none(value, default=None)
if value is not None:
template_dict[key] = value
# Missing numeric fields used together with integer presentation types
# in format specification will break the argument substitution since
# string NA placeholder is returned for missing fields. We will patch
# output template for missing fields to meet string presentation type.
for numeric_field in numeric_fields:
if numeric_field not in template_dict:
outtmpl = re.sub(
FORMAT_RE.format(re.escape(numeric_field)),
r'%({0})s'.format(numeric_field), outtmpl)
# expand_path translates '%%' into '%' and '$$' into '$' # expand_path translates '%%' into '%' and '$$' into '$'
# correspondingly that is not what we want since we need to keep # correspondingly that is not what we want since we need to keep
@ -875,6 +874,7 @@ class YoutubeDL(object):
# title "Hello $PATH", we don't want `$PATH` to be expanded. # title "Hello $PATH", we don't want `$PATH` to be expanded.
filename = expand_path(outtmpl).replace(sep, '') % template_dict filename = expand_path(outtmpl).replace(sep, '') % template_dict
force_ext = OUTTMPL_TYPES.get(tmpl_type)
if force_ext is not None: if force_ext is not None:
filename = replace_extension(filename, force_ext, template_dict.get('ext')) filename = replace_extension(filename, force_ext, template_dict.get('ext'))

@ -1147,13 +1147,18 @@ def parseOpts(overrideArguments=None):
metavar='FIELD:FORMAT', dest='metafromfield', action='append', metavar='FIELD:FORMAT', dest='metafromfield', action='append',
help=( help=(
'Parse additional metadata like title/artist from other fields. ' 'Parse additional metadata like title/artist from other fields. '
'Give field name to extract data from, and format of the field seperated by a ":". ' 'Give a template or field name to extract data from and the '
'format to interpret it as, seperated by a ":". '
'Either regular expression with named capture groups or a ' 'Either regular expression with named capture groups or a '
'similar syntax to the output template can also be used. ' 'similar syntax to the output template can be used for the FORMAT. '
'The parsed parameters replace any existing values and can be use in output template. ' 'Similarly, the syntax for output template can be used for FIELD '
'to parse the data from multiple fields. '
'The parsed parameters replace any existing values and can be used in output templates. '
'This option can be used multiple times. ' 'This option can be used multiple times. '
'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like ' 'Example: --parse-metadata "title:%(artist)s - %(title)s" matches a title like '
'"Coldplay - Paradise". ' '"Coldplay - Paradise". '
'Example: --parse-metadata "%(series)s %(episode_number)s:%(title)s" '
'sets the title using series and episode number. '
'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"')) 'Example (regex): --parse-metadata "description:Artist - (?P<artist>.+?)"'))
postproc.add_option( postproc.add_option(
'--xattrs', '--xattrs',

@ -8,7 +8,7 @@ from ..utils import str_or_none
class MetadataFromFieldPP(PostProcessor): class MetadataFromFieldPP(PostProcessor):
regex = r'(?P<field>\w+):(?P<format>.+)$' regex = r'(?P<in>.+):(?P<out>.+)$'
def __init__(self, downloader, formats): def __init__(self, downloader, formats):
PostProcessor.__init__(self, downloader) PostProcessor.__init__(self, downloader)
@ -19,11 +19,20 @@ class MetadataFromFieldPP(PostProcessor):
match = re.match(self.regex, f) match = re.match(self.regex, f)
assert match is not None assert match is not None
self._data.append({ self._data.append({
'field': match.group('field'), 'in': match.group('in'),
'format': match.group('format'), 'out': match.group('out'),
'regex': self.format_to_regex(match.group('format'))}) 'tmpl': self.field_to_template(match.group('in')),
'regex': self.format_to_regex(match.group('out')),
})
def format_to_regex(self, fmt): @staticmethod
def field_to_template(tmpl):
    """
    Turn the FIELD part of --parse-metadata into an output template.

    A value made only of word characters is treated as a bare field name
    and wrapped as '%(name)s'; anything else is returned unchanged.
    """
    is_bare_field = re.match(r'\w+$', tmpl) is not None
    return '%%(%s)s' % tmpl if is_bare_field else tmpl
@staticmethod
def format_to_regex(fmt):
r""" r"""
Converts a string like Converts a string like
'%(title)s - %(artist)s' '%(title)s - %(artist)s'
@ -37,7 +46,7 @@ class MetadataFromFieldPP(PostProcessor):
# replace %(..)s with regex group and escape other string parts # replace %(..)s with regex group and escape other string parts
for match in re.finditer(r'%\((\w+)\)s', fmt): for match in re.finditer(r'%\((\w+)\)s', fmt):
regex += re.escape(fmt[lastpos:match.start()]) regex += re.escape(fmt[lastpos:match.start()])
regex += r'(?P<' + match.group(1) + r'>[^\r\n]+)' regex += r'(?P<%s>[^\r\n]+)' % match.group(1)
lastpos = match.end() lastpos = match.end()
if lastpos < len(fmt): if lastpos < len(fmt):
regex += re.escape(fmt[lastpos:]) regex += re.escape(fmt[lastpos:])
@ -45,22 +54,16 @@ class MetadataFromFieldPP(PostProcessor):
def run(self, info): def run(self, info):
for dictn in self._data: for dictn in self._data:
field, regex = dictn['field'], dictn['regex'] tmpl, info_copy = self._downloader.prepare_outtmpl(dictn['tmpl'], info)
if field not in info: data_to_parse = tmpl % info_copy
self.report_warning('Video doesnot have a %s' % field) self.write_debug('Searching for r"%s" in %s' % (dictn['regex'], tmpl))
continue match = re.search(dictn['regex'], data_to_parse)
data_to_parse = str_or_none(info[field])
if data_to_parse is None:
self.report_warning('Field %s cannot be parsed' % field)
continue
self.write_debug('Searching for r"%s" in %s' % (regex, field))
match = re.search(regex, data_to_parse)
if match is None: if match is None:
self.report_warning('Could not interpret video %s as "%s"' % (field, dictn['format'])) self.report_warning('Could not interpret video %s as "%s"' % (dictn['in'], dictn['out']))
continue continue
for attribute, value in match.groupdict().items(): for attribute, value in match.groupdict().items():
info[attribute] = value info[attribute] = value
self.to_screen('parsed %s from %s: %s' % (attribute, field, value if value is not None else 'NA')) self.to_screen('parsed %s from "%s": %s' % (attribute, dictn['in'], value if value is not None else 'NA'))
return [], info return [], info

@ -4205,6 +4205,20 @@ OUTTMPL_TYPES = {
'pl_infojson': 'info.json', 'pl_infojson': 'info.json',
} }
# Regex template matching a single printf-style conversion specifier that
# uses a mapping key. The "{0}" placeholder must be filled in with
# str.format() before use, with a regex fragment matching the wanted key
# (re.escape() it when the key is a literal field name).
# The "(?<!%)" lookbehind skips specifiers preceded by another "%".
# The conversion type character is captured in the named group "type".
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
FORMAT_RE = r'''(?x)
(?<!%)
%
\({0}\) # mapping key
(?:[#0\-+ ]+)? # conversion flags (optional)
(?:\d+)? # minimum field width (optional)
(?:\.\d+)? # precision (optional)
[hlL]? # length modifier (optional)
(?P<type>[diouxXeEfFgGcrs%]) # conversion type
'''
def limit_length(s, length): def limit_length(s, length):
""" Add ellipses to overly long strings """ """ Add ellipses to overly long strings """

Loading…
Cancel
Save