[ie/facebook:ads] Fix extractor (#10704)

Closes #10701
Authored by: kclauhk
This commit is contained in:
kclauhk 2024-08-16 03:53:37 +08:00 committed by GitHub
parent cc88a54bb1
commit d62fef7e07
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -963,6 +963,7 @@ class FacebookAdsIE(InfoExtractor):
'id': '899206155126718', 'id': '899206155126718',
'ext': 'mp4', 'ext': 'mp4',
'title': 'video by Kandao', 'title': 'video by Kandao',
'description': 'md5:0822724069e3aca97cbed5dabbab282e',
'uploader': 'Kandao', 'uploader': 'Kandao',
'uploader_id': '774114102743284', 'uploader_id': '774114102743284',
'uploader_url': r're:^https?://.*', 'uploader_url': r're:^https?://.*',
@ -971,6 +972,22 @@ class FacebookAdsIE(InfoExtractor):
'upload_date': '20231214', 'upload_date': '20231214',
'like_count': int, 'like_count': int,
}, },
}, {
# key 'watermarked_video_sd_url' missing
'url': 'https://www.facebook.com/ads/library/?id=501152689226254',
'info_dict': {
'id': '501152689226254',
'ext': 'mp4',
'title': 'video by mat.nawrocki',
'description': 'md5:02a446ace7ff8c3c37a2892922492490',
'uploader': 'mat.nawrocki',
'uploader_id': '148586968341456',
'uploader_url': r're:^https?://.*',
'timestamp': 1723452305,
'thumbnail': r're:^https?://.*',
'upload_date': '20240812',
'like_count': int,
},
}, { }, {
'url': 'https://www.facebook.com/ads/library/?id=893637265423481', 'url': 'https://www.facebook.com/ads/library/?id=893637265423481',
'info_dict': { 'info_dict': {
@ -1017,34 +1034,42 @@ class FacebookAdsIE(InfoExtractor):
video_id = self._match_id(url) video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
post_data = [self._parse_json(j, video_id, fatal=False) post_data = traverse_obj(
for j in re.findall(r's\.handle\(({.*})\);requireLazy\(', webpage)] re.findall(r'data-sjs>({.*?ScheduledServerJS.*?})</script>', webpage), (..., {json.loads}))
data = traverse_obj(post_data, ( data = get_first(post_data, (
..., 'require', ..., ..., ..., 'props', 'deeplinkAdCard', 'snapshot', {dict}), get_all=False) 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ...,
'entryPointRoot', 'otherProps', 'deeplinkAdCard', 'snapshot', {dict}))
if not data: if not data:
raise ExtractorError('Unable to extract ad data') raise ExtractorError('Unable to extract ad data')
title = data.get('title') title = data.get('title')
if not title or title == '{{product.name}}': if not title or title == '{{product.name}}':
title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data) title = join_nonempty('display_format', 'page_name', delim=' by ', from_dict=data)
markup_id = traverse_obj(data, ('body', '__m', {str}))
markup = traverse_obj(post_data, (
..., 'require', ..., ..., ..., '__bbox', 'markup', lambda _, v: v[0].startswith(markup_id),
..., '__html', {clean_html}, {lambda x: not x.startswith('{{product.') and x}, any))
info_dict = traverse_obj(data, { info_dict = merge_dicts({
'description': ('link_description', {str}, {lambda x: x if x != '{{product.description}}' else None}), 'title': title,
'description': markup or None,
}, traverse_obj(data, {
'description': ('link_description', {lambda x: x if not x.startswith('{{product.') else None}),
'uploader': ('page_name', {str}), 'uploader': ('page_name', {str}),
'uploader_id': ('page_id', {str_or_none}), 'uploader_id': ('page_id', {str_or_none}),
'uploader_url': ('page_profile_uri', {url_or_none}), 'uploader_url': ('page_profile_uri', {url_or_none}),
'timestamp': ('creation_time', {int_or_none}), 'timestamp': ('creation_time', {int_or_none}),
'like_count': ('page_like_count', {int_or_none}), 'like_count': ('page_like_count', {int_or_none}),
}) }))
entries = [] entries = []
for idx, entry in enumerate(traverse_obj( for idx, entry in enumerate(traverse_obj(
data, (('videos', 'cards'), lambda _, v: any(url_or_none(v[f]) for f in self._FORMATS_MAP))), 1, data, (('videos', 'cards'), lambda _, v: any(url_or_none(v.get(f)) for f in self._FORMATS_MAP))), 1,
): ):
entries.append({ entries.append({
'id': f'{video_id}_{idx}', 'id': f'{video_id}_{idx}',
'title': entry.get('title') or title, 'title': entry.get('title') or title,
'description': entry.get('link_description') or info_dict.get('description'), 'description': traverse_obj(entry, 'body', 'link_description') or info_dict.get('description'),
'thumbnail': url_or_none(entry.get('video_preview_image_url')), 'thumbnail': url_or_none(entry.get('video_preview_image_url')),
'formats': self._extract_formats(entry), 'formats': self._extract_formats(entry),
}) })