# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

import os
import posixpath
import urllib3
from pprint import pprint
from urllib.parse import urlparse
import re

http = urllib3.PoolManager()

# Splits an HLS attribute list on the commas that sit outside of quoted strings,
# so that values such as CODECS="avc1.4d401f,mp4a.40.2" stay in one piece.
_ATTRIBUTE_SEPARATOR = re.compile(r',\s*(?=(?:[^"]*"[^"]*")*[^"]*$)')

# Tags whose URI attribute references another resource, per manifest type
_MASTER_URI_TAGS = ('#EXT-X-MEDIA:', '#EXT-X-I-FRAME-STREAM-INF:')
_VARIANT_URI_TAGS = ('#EXT-X-MAP:', '#EXT-X-I-FRAME-STREAM-INF:')


class HlsVodAsset:
    """Crawl an HLS VOD asset starting from its master manifest.

    Constructing an instance immediately downloads the master manifest and
    every variant manifest it references, then populates:

    * masterManifestContentType - content type reported for the master manifest
    * variantManifests          - absolute URLs of the variant manifests
    * variantManifestsData      - {variantUrl: {"body": str, "contentType": str}}
    * mediaSegmentList          - absolute URLs of all media and init segments
    * allResources              - de-duplicated list of every URL above
    * commonPrefix              - longest shared URL prefix ending in '/'
    """

    def __init__(self, masterManifest, authHeaders=None):
        """Store the inputs and parse the asset.

        masterManifest: absolute URL of the master manifest.
        authHeaders:    optional dict of HTTP headers sent with every request.
        """
        self.masterManifest = masterManifest
        self.masterManifestContentType = None
        self.variantManifests = None
        self.variantManifestsData = {}
        self.mediaSegmentList = []
        self.commonPrefix = None
        self.allResources = None
        self.authHeaders = authHeaders

        self.parseHlsVodAsset()

    # Function will retrieve the master manifest and all variant manifests and
    # extract a list of all media and init segments. Absolute URLs for the
    # segments are stored in mediaSegmentList.
    def parseHlsVodAsset(self):
        """Download and parse all manifests, then derive the resource list and prefix.

        Raises ValueError if the master manifest or any variant manifest could
        not be retrieved (getManifest returned no body for it).
        """

        # Retrieve Master Manifest
        (masterManifestBody, self.masterManifestContentType) = getManifest(self.masterManifest, self.authHeaders)
        if masterManifestBody is None:
            raise ValueError("Unable to retrieve master manifest: %s" % self.masterManifest)

        # Parse Master Manifest
        self.variantManifests = parseMasterManifest(self.masterManifest, masterManifestBody)

        # For each variant manifest
        for variant in self.variantManifests:

            # Retrieve Variant Manifest
            (variantManifestBody, variantContentType) = getManifest(variant, self.authHeaders)
            self.variantManifestsData[variant] = {
                "body": variantManifestBody,
                "contentType": variantContentType
            }
            if variantManifestBody is None:
                raise ValueError("Unable to retrieve variant manifest: %s" % variant)

            # Parse Variant Manifest
            segments = parseVariantManifest(variant, variantManifestBody)
            self.mediaSegmentList.extend(segments)

        # Determine commonPrefix across all resources
        allResources = [self.masterManifest]
        allResources.extend(self.variantManifests)
        allResources.extend(self.mediaSegmentList)

        # Duplicates need to be removed from the list of all segments.
        # This may not strictly be necessary for HLS/CMAF streams but cannot hurt.
        # dict.fromkeys keeps first-seen order, so the result is the same on every run
        # (a set would reorder the URLs differently from one process to the next).
        uniqueResources = list(dict.fromkeys(allResources))
        self.allResources = uniqueResources

        # Set common prefix
        # The common prefix must end with '/' to indicate this is a path and does not include
        # the start of the name of the files. For example, if all the resources of an asset start
        # with 'asset1' and the content is stored in 'my/asset/path' the common prefix should be
        # 'my/asset/path/' not 'my/asset/path/asset1'
        commonPrefix = os.path.commonprefix(uniqueResources)
        if not commonPrefix.endswith('/'):
            # Strip off anything to the right of the last '/' because this component represents
            # the common string at the start of the filenames. Plain slicing is used rather than
            # a regular expression so that characters such as '.', '+' or '(' in file names
            # cannot be misread as pattern syntax.
            commonPrefix = commonPrefix[:commonPrefix.rfind('/') + 1]
        self.commonPrefix = commonPrefix

        return


# Function will download a manifest and return its decoded body and content type
def getManifest(url, authHeaders):
    """Fetch an HLS manifest over HTTP.

    url:         absolute URL of the manifest.
    authHeaders: dict of HTTP headers to send, or None.

    Returns a tuple (body, contentType). body is the manifest decoded as UTF-8,
    or None when the server did not answer with status 200 or the payload length
    did not match the Content-Length header. contentType is the Content-Type
    header value (None if absent or if the status was not 200), with
    'binary/octet-stream' replaced by 'application/x-mpegURL'.

    Exceptions raised by the HTTP request are logged and re-raised. If the
    payload does not start with '#EXTM3U' a message is printed and SystemExit(2)
    is raised.
    """
    contentType = None

    try:
        response = http.request("GET", url, headers=authHeaders)
    except (IOError, urllib3.exceptions.HTTPError) as urlErr:
        # urllib3 errors do not derive from IOError, so both families are listed
        print("Exception occurred while attempting to get: %s" % url)
        print(repr(urlErr))
        raise

    if response.status != 200:
        print('http error', response.status, 'fetching', url)
        return (None, contentType)

    urlPayload = response.data
    contentType = response.headers.get('Content-Type')

    # Some packagers set the manifest type incorrectly.
    # This needs to be corrected if the content type is 'binary/octet-stream'
    if contentType == 'binary/octet-stream':
        print("Content type was '%s', overriding to '%s'" % (contentType, 'application/x-mpegURL'))
        contentType = 'application/x-mpegURL'

    # Not all servers return a 'Content-Length' header. If available it is worth checking.
    # The check is skipped for compressed responses: urllib3 hands back the decoded body
    # while Content-Length counts the encoded bytes, so the two would never agree.
    contentLength = response.headers.get('Content-Length')
    contentEncoding = response.headers.get('Content-Encoding')
    if contentLength is not None and contentEncoding in (None, '', 'identity'):
        expectedLen = int(contentLength)
        receivedLen = len(urlPayload)
        if receivedLen != expectedLen:
            print('HlsVodAsset: ', url, 'expected', expectedLen, '; received', receivedLen)
            urlPayload = None

    if urlPayload is None:
        return (None, contentType)

    urlPayload = urlPayload.decode('utf-8')

    # Check if it's an HLS manifest
    if not urlPayload.startswith('#EXTM3U'):
        print('Not an HLS manifest:', url)
        # SystemExit keeps the exit status of 2 but, unlike os._exit, lets the
        # interpreter flush the message above and run cleanup handlers.
        raise SystemExit(2)

    return (urlPayload, contentType)


# Function will parse master manifest and extract a list of all the variant manifests
def parseMasterManifest(masterManifestUrl, masterManifestBody):
    """Return the absolute URLs of the manifests referenced by a master manifest.

    masterManifestUrl:  URL the manifest was retrieved from; relative references
                        are resolved against its directory.
    masterManifestBody: manifest text.

    URLs are taken from the URI attribute of EXT-X-MEDIA and
    EXT-X-I-FRAME-STREAM-INF tags and from every non-comment line, e.g.
      #EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="audio_0",NAME="und",URI="a/b/index_1.m3u8"
      ../../../bf4fc289ea7a4a9a8030bfdfb6dd8180/75449fe7ed1a49288019306701174382/index_1.m3u8
    Tags without a URI attribute are ignored. Each URL is listed once, in order
    of first appearance.
    """
    return _collectUris(masterManifestUrl, masterManifestBody, _MASTER_URI_TAGS)


# Normalises url and removes additional '..' notations
def normalizeUrl(url):
    """Return url rebuilt as scheme://netloc/path with '.' and '..' collapsed.

    Only the scheme, network location and path are kept; any query string or
    fragment is dropped. posixpath is used so that the result always uses '/'
    regardless of the operating system running the code.
    """
    o = urlparse(url)
    absPath = posixpath.normpath(o.path)
    absUrl = "%s://%s%s" % (o.scheme, o.netloc, absPath)
    return absUrl


# Function will parse variant manifest and extract a list of all media and init segments
def parseVariantManifest(variantManifestUrl, variantManifestBody):
    """Return the absolute URLs of the segments referenced by a variant manifest.

    variantManifestUrl:  URL the manifest was retrieved from; relative references
                         are resolved against its directory.
    variantManifestBody: manifest text.

    URLs are taken from the URI attribute of EXT-X-MAP and
    EXT-X-I-FRAME-STREAM-INF tags and from every non-comment line, e.g.
      #EXT-X-MAP:URI="../../../a595fd669f4349e1846efee6e27ccfa8/index_video_1_init.mp4"
      ../../../bf4fc289ea7a4a9a8030bfdfb6dd8180/75449fe7ed1a49288019306701174382/index_1_0.ts
    Blank lines are skipped. Each URL is listed once, in order of first appearance.
    """
    return _collectUris(variantManifestUrl, variantManifestBody, _VARIANT_URI_TAGS)


# Shared line scanner behind parseMasterManifest and parseVariantManifest
def _collectUris(manifestUrl, manifestBody, uriTags):
    """Collect the unique absolute URLs referenced by a manifest.

    uriTags is a tuple of tag prefixes (including the trailing ':') whose URI
    attribute should be followed.
    """
    # A dict is used as an ordered set: keys keep first-seen order
    resolved = {}

    for rawLine in manifestBody.splitlines():
        line = rawLine.strip()

        # Skip blank lines
        if not line:
            continue

        if line.startswith(uriTags):
            # The URI attribute is optional on some tags, so a missing one is not an error
            name = _parseAttributeList(line.split(':', 1)[1]).get('URI')
        elif line.startswith('#'):
            # Any other tag or comment carries no resource reference
            continue
        else:
            # Lines which do not start with '#' are resource references
            name = line

        if not name:
            continue

        resolved.setdefault(_resolveUrl(manifestUrl, name), 1)

    return list(resolved)


# Turns 'KEY=VALUE,KEY="VALUE"' into a dict with the surrounding quotes removed
def _parseAttributeList(attributeList):
    """Parse the attribute list that follows the ':' of an HLS tag."""
    attributes = {}
    for keyVal in _ATTRIBUTE_SEPARATOR.split(attributeList):
        (key, separator, val) = keyVal.partition('=')
        # Elements without '=' (for example after a trailing comma) carry no attribute
        if not separator:
            continue
        attributes[key.strip()] = val.strip().strip('"')
    return attributes


# Resolves a reference found in a manifest against the manifest's own URL
def _resolveUrl(baseUrl, name):
    """Return name unchanged if it starts with 'http', otherwise resolve it
    relative to the directory of baseUrl and normalise the result."""
    if name.startswith("http"):
        return name
    return normalizeUrl("%s/%s" % (posixpath.dirname(baseUrl), name))