build: add elecom-product-header for ELECOM devices
[openwrt/openwrt.git] / scripts / dl_github_archive.py
1 #!/usr/bin/env python3
2 #
3 # Copyright (c) 2018 Yousong Zhou <yszhou4tech@gmail.com>
4 #
5 # This is free software, licensed under the GNU General Public License v2.
6 # See /LICENSE for more information.
7
8 import argparse
9 import calendar
10 import datetime
11 import errno
12 import fcntl
13 import hashlib
14 import json
15 import os
16 import os.path
17 import re
18 import shutil
19 import ssl
20 import subprocess
21 import sys
22 import time
23 import urllib.request
24
# Scratch root: $TMP_DIR when it is set and non-empty, otherwise /tmp.
TMPDIR = os.getenv('TMP_DIR') or '/tmp'
# Working directory under the scratch root used for downloads and the
# commit timestamp cache.
TMPDIR_DL = os.path.join(TMPDIR, 'dl')
27
28
class PathException(Exception):
    """Error raised by the ``Path`` tar/untar helpers."""
class DownloadGitHubError(Exception):
    """Error raised when downloading or verifying a GitHub archive fails."""
31
32
class Path(object):
    """Context manager for preparing and cleaning up a filesystem path.

    If ``preclean`` is ``True``, ``path`` is removed on context enter;
    otherwise it is left as is.

    If ``isdir`` is ``True``, ``path`` is created as a directory on context
    enter.

    If ``keep`` is ``True``, ``path`` will NOT be removed on context exit.
    """

    def __init__(self, path, isdir=True, preclean=False, keep=False):
        self.path = path
        self.isdir = isdir
        self.preclean = preclean
        self.keep = keep

    def __enter__(self):
        if self.preclean:
            self.rm_all(self.path)
        if self.isdir:
            self.mkdir_all(self.path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None, so an exception raised in the body still propagates.
        if not self.keep:
            self.rm_all(self.path)

    @staticmethod
    def mkdir_all(path):
        """Same as mkdir -p.

        Creates every missing component of ``path``; an already existing
        ``path`` is not an error.
        """
        # os.path.split() only yields (head, tail), so walking its result
        # would create at most one missing level. os.makedirs handles
        # arbitrarily deep paths.
        Path._os_func(os.makedirs, path, errno.EEXIST)

    @staticmethod
    def _rmdir_dir(dir_):
        """Remove directory ``dir_`` together with everything below it."""
        names = Path._listdir(dir_)
        for name in names:
            p = os.path.join(dir_, name)
            Path.rm_all(p)
        Path._rmdir(dir_)

    @staticmethod
    def _mkdir(path):
        """Create a single directory, ignoring EEXIST."""
        Path._os_func(os.mkdir, path, errno.EEXIST)

    @staticmethod
    def _rmdir(path):
        """Remove an empty directory, ignoring ENOENT."""
        Path._os_func(os.rmdir, path, errno.ENOENT)

    @staticmethod
    def _remove(path):
        """Remove a file or symlink, ignoring ENOENT."""
        Path._os_func(os.remove, path, errno.ENOENT)

    @staticmethod
    def _listdir(path):
        """List directory entries; return ``[]`` on ENOENT."""
        return Path._os_func(os.listdir, path, errno.ENOENT, default=[])

    @staticmethod
    def _os_func(func, path, errno, default=None):
        """Call func(path) in an idempotent way.

        On exception ``ex``, if the type is OSError and ``ex.errno == errno``,
        return ``default``, otherwise, re-raise

        Note that the ``errno`` parameter shadows the ``errno`` module inside
        this function; it is the single error number to tolerate.
        """
        try:
            return func(path)
        except OSError as e:
            if e.errno == errno:
                return default
            else:
                raise

    @staticmethod
    def rm_all(path):
        """Same as rm -r."""
        # Check for a symlink first so that a link to a directory is removed
        # itself and its target is never descended into.
        if os.path.islink(path):
            Path._remove(path)
        elif os.path.isdir(path):
            Path._rmdir_dir(path)
        else:
            Path._remove(path)

    @staticmethod
    def untar(path, into=None):
        """Extract tarball at ``path`` into subdir ``into``.

        ``into`` is passed to ``tar -C`` and listed afterwards, so callers
        have to supply it despite the ``None`` default.

        return subdir name if and only if there exists one, otherwise raise PathException
        """
        args = ('tar', '-C', into, '-xzf', path, '--no-same-permissions')
        # The umask is set in the child only, right before tar is executed.
        subprocess.check_call(args, preexec_fn=lambda: os.umask(0o22))
        dirs = os.listdir(into)
        if len(dirs) == 1:
            return dirs[0]
        else:
            raise PathException('untar %s: expecting a single subdir, got %s' % (path, dirs))

    @staticmethod
    def tar(path, subdir, into=None, ts=None):
        """Pack ``subdir`` of directory ``path`` into tarball ``into``.

        The compression is chosen from the suffix of ``into`` (``.xz``,
        ``.bz2`` or ``.gz``); any other suffix raises PathException. If ``ts``
        is given it is passed to tar as ``--mtime=@ts``.
        """
        # --sort=name requires a recent build of GNU tar
        args = ['tar', '--numeric-owner', '--owner=0', '--group=0', '--sort=name']
        args += ['-C', path, '-cf', into, subdir]
        envs = os.environ.copy()
        if ts is not None:
            args.append('--mtime=@%d' % ts)
        if into.endswith('.xz'):
            envs['XZ_OPT'] = '-7e'
            args.append('-J')
        elif into.endswith('.bz2'):
            args.append('-j')
        elif into.endswith('.gz'):
            args.append('-z')
            envs['GZIP'] = '-n'
        else:
            raise PathException('unknown compression type %s' % into)
        subprocess.check_call(args, env=envs)
152
153
class GitHubCommitTsCache(object):
    """File-backed cache mapping a key to a commit timestamp.

    The cache file holds one ``key ts updated`` record per line, where
    ``updated`` is the time the record was written. Reads take a shared
    ``lockf`` lock on the file and writes take an exclusive one.
    """

    __cachef = 'github.commit.ts.cache'
    # Maximum number of records kept in the cache file.
    __cachen = 2048

    def __init__(self):
        Path.mkdir_all(TMPDIR_DL)
        self.cachef = os.path.join(TMPDIR_DL, self.__cachef)
        self.cache = {}

    def get(self, k):
        """Get timestamp with key ``k``.

        Return ``None`` when ``k`` is not cached.
        """
        # 0o666 (still subject to the umask): a data file has no reason to
        # be created executable, which is what the os.open default would do.
        fileno = os.open(self.cachef, os.O_RDONLY | os.O_CREAT, 0o666)
        with os.fdopen(fileno) as fin:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_SH)
                self._cache_init(fin)
                if k in self.cache:
                    ts = self.cache[k][0]
                    return ts
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)
        return None

    def set(self, k, v):
        """Update timestamp with ``k``."""
        fileno = os.open(self.cachef, os.O_RDWR | os.O_CREAT, 0o666)
        # Wrapping an already open descriptor does not truncate the file, so
        # the existing records can still be read before being rewritten.
        with os.fdopen(fileno, 'w+') as f:
            try:
                fcntl.lockf(fileno, fcntl.LOCK_EX)
                self._cache_init(f)
                self.cache[k] = (v, int(time.time()))
                self._cache_flush(f)
            finally:
                fcntl.lockf(fileno, fcntl.LOCK_UN)

    def _cache_init(self, fin):
        """Load the records of ``fin`` into ``self.cache``.

        Lines that do not parse as ``key ts updated`` are skipped so that a
        damaged cache file does not make the cache unusable.
        """
        for line in fin:
            try:
                k, ts, updated = line.split()
                ts = int(ts)
                updated = int(updated)
            except ValueError:
                continue
            self.cache[k] = (ts, updated)

    def _cache_flush(self, fout):
        """Rewrite ``fout`` with the most recently updated records.

        At most ``__cachen`` records are kept; the in-memory cache is reset
        afterwards.
        """
        cache = sorted(self.cache.items(), key=lambda a: a[1][1])
        # Sorted oldest first: keep the tail so that the oldest records are
        # the ones evicted, never the record that was just written.
        cache = cache[-self.__cachen:]
        self.cache = {}
        os.ftruncate(fout.fileno(), 0)
        fout.seek(0, os.SEEK_SET)
        for k, ent in cache:
            ts = ent[0]
            updated = ent[1]
            line = '{0} {1} {2}\n'.format(k, ts, updated)
            fout.write(line)
207
208
class DownloadGitHubTarball(object):
    """Download and repack archive tarball from GitHub.

    Compared with the method of packing after cloning the whole repo, this
    method is more friendly to users with fragile internet connection.

    However, there are limitations with this method

     - GitHub imposes a 60 reqs/hour limit for unauthenticated API access.
       This affects fetching commit date for reproducible tarballs.  Download
       through the archive link is not affected.

     - GitHub archives do not contain source codes for submodules.

     - GitHub archives seem to respect .gitattributes and ignore paths with
       export-ignore attributes.

    For the first two issues, the method will fail loudly to allow fallback to
    clone-then-pack method.

    As for the 3rd issue, to make sure that this method only produces identical
    tarballs as the fallback method, we require the expected hash value to be
    supplied.  That means the first tarball will need to be prepared by the
    clone-then-pack method
    """

    # The dots are escaped so that only the literal host github.com matches.
    __repo_url_regex = re.compile(r'^(?:https|git)://github\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)')

    def __init__(self, args):
        """Set up the download from parsed command line ``args``.

        Reads ``dl_dir``, ``version``, ``subdir``, ``source``, ``url`` and
        ``hash`` from ``args``. Raises DownloadGitHubError when the URL is
        not a GitHub repo URL or the hash has an unsupported length.
        """
        self.dl_dir = args.dl_dir
        self.version = args.version
        self.subdir = args.subdir
        self.source = args.source
        self.url = args.url
        self._init_owner_repo()
        self.xhash = args.hash
        self._init_hasher()
        self.commit_ts = None   # lazy load commit timestamp
        self.commit_ts_cache = GitHubCommitTsCache()
        self.name = 'github-tarball'

    def download(self):
        """Download and repack GitHub archive tarball."""
        self._init_commit_ts()
        with Path(TMPDIR_DL, keep=True) as dir_dl:
            # fetch tarball from GitHub
            tarball_path = os.path.join(dir_dl.path, self.subdir + '.tar.gz.dl')
            with Path(tarball_path, isdir=False):
                self._fetch(tarball_path)
                # unpack
                d = os.path.join(dir_dl.path, self.subdir + '.untar')
                with Path(d, preclean=True) as dir_untar:
                    tarball_prefix = Path.untar(tarball_path, into=dir_untar.path)
                    dir0 = os.path.join(dir_untar.path, tarball_prefix)
                    dir1 = os.path.join(dir_untar.path, self.subdir)
                    # submodules check
                    if self._has_submodule(dir0):
                        raise self._error('Fetching submodules is not yet supported')
                    # rename subdir
                    os.rename(dir0, dir1)
                    # repack
                    into = os.path.join(TMPDIR_DL, self.source)
                    Path.tar(dir_untar.path, self.subdir, into=into, ts=self.commit_ts)
                    try:
                        self._hash_check(into)
                    except Exception:
                        # do not leave a tarball with the wrong hash behind
                        Path.rm_all(into)
                        raise
        # move to target location
        file1 = os.path.join(self.dl_dir, self.source)
        if into != file1:
            shutil.move(into, file1)

    def _has_submodule(self, dir_):
        """Return whether ``dir_`` has a non-empty ``.gitmodules`` file.

        A stat failure other than ENOENT is also reported as ``True``.
        """
        m = os.path.join(dir_, '.gitmodules')
        try:
            st = os.stat(m)
            return st.st_size > 0
        except OSError as e:
            return e.errno != errno.ENOENT

    def _init_owner_repo(self):
        """Extract ``self.owner`` and ``self.repo`` from ``self.url``.

        Raises DownloadGitHubError when the URL is missing or is not a
        GitHub repo URL.
        """
        m = self.__repo_url_regex.search(self.url or '')
        if m is None:
            raise self._error('Invalid github url: {}'.format(self.url))
        owner = m.group('owner')
        repo = m.group('repo')
        if repo.endswith('.git'):
            repo = repo[:-4]
        self.owner = owner
        self.repo = repo

    def _init_hasher(self):
        """Pick the hash algorithm from the length of the expected hash.

        64 hex digits select sha256 and 32 select md5. A missing hash or any
        other length raises DownloadGitHubError.
        """
        xhash = self.xhash
        hash_len = len(xhash) if xhash else 0
        if hash_len == 64:
            self.hasher = hashlib.sha256()
        elif hash_len == 32:
            self.hasher = hashlib.md5()
        else:
            raise self._error('Requires sha256sum for verification')
        # hexdigest() is lower case, so compare against a lower case value
        self.xhash = xhash.lower()

    def _hash_check(self, f):
        """Raise DownloadGitHubError unless file ``f`` has the expected hash."""
        # Hash with a copy so that self.hasher stays pristine and every call
        # digests exactly one file.
        hasher = self.hasher.copy()
        with open(f, 'rb') as fin:
            for d in iter(lambda: fin.read(4096), b''):
                hasher.update(d)
        xhash = hasher.hexdigest()
        if xhash != self.xhash:
            raise self._error('Wrong hash (probably caused by .gitattributes), expecting {}, got {}'.format(self.xhash, xhash))

    def _init_commit_ts(self):
        """Load ``self.commit_ts`` from the cache or from the GitHub API.

        Raises DownloadGitHubError listing the reason of each failed attempt
        when neither API yields a timestamp.
        """
        if self.commit_ts is not None:
            return
        # GitHub provides 2 APIs[1,2] for fetching commit data.  API[1] is more
        # terse while API[2] provides more verbose info such as commit diff
        # etc.  That's the main reason why API[1] is preferred: the response
        # size is predictable.
        #
        # However, API[1] only accepts complete commit sha1sum as the parameter
        # while API[2] is more liberal accepting also partial commit id and
        # tags, etc.
        #
        # [1] Git Commits, Git Data, https://developer.github.com/v3/git/commits/#get-a-commit
        # [2] Get a single commit, Repositories, https://developer.github.com/v3/repos/commits/#get-a-single-commit
        apis = [
            {
                'url': self._make_repo_url_path('git', 'commits', self.version),
                'attr_path': ('committer', 'date'),
            }, {
                'url': self._make_repo_url_path('commits', self.version),
                'attr_path': ('commit', 'committer', 'date'),
            },
        ]
        version_is_sha1sum = len(self.version) == 40
        if not version_is_sha1sum:
            # API[1] cannot resolve this version, try API[2] first
            apis.insert(0, apis.pop())
        reasons = []
        for api in apis:
            url = api['url']
            attr_path = api['attr_path']
            try:
                ct = self.commit_ts_cache.get(url)
                if ct is not None:
                    self.commit_ts = ct
                    return
                ct = self._init_commit_ts_remote_get(url, attr_path)
                self.commit_ts = ct
                self.commit_ts_cache.set(url, ct)
                return
            except Exception as e:
                # remember why this API failed and fall through to the next
                reasons.append('\n' + (" {}: {}".format(url, e)))
        raise self._error('Cannot fetch commit ts:{}'.format(''.join(reasons)))

    def _init_commit_ts_remote_get(self, url, attrpath):
        """Fetch ``url`` and return the commit date found at ``attrpath``.

        ``attrpath`` is the sequence of keys leading to the date string in
        the JSON response. The date is returned as seconds since the epoch.
        """
        with self._make_request(url) as resp:
            data = resp.read()
        date = json.loads(data)
        for attr in attrpath:
            date = date[attr]
        date = datetime.datetime.strptime(date, '%Y-%m-%dT%H:%M:%SZ')
        date = date.timetuple()
        # the parsed date is UTC (literal "Z" suffix), hence timegm
        ct = calendar.timegm(date)
        return ct

    def _fetch(self, path):
        """Fetch tarball of the specified version ref."""
        ref = self.version
        url = self._make_repo_url_path('tarball', ref)
        with self._make_request(url) as resp:
            with open(path, 'wb') as fout:
                shutil.copyfileobj(resp, fout, 4096)

    def _make_repo_url_path(self, *args):
        """Return the API path of the repo with ``args`` appended as components."""
        url = '/repos/{0}/{1}'.format(self.owner, self.repo)
        if args:
            url += '/' + '/'.join(args)
        return url

    def _make_request(self, path):
        """Request GitHub API endpoint on ``path``."""
        url = 'https://api.github.com' + path
        headers = {
            'Accept': 'application/vnd.github.v3+json',
            'User-Agent': 'OpenWrt',
        }
        req = urllib.request.Request(url, headers=headers)
        # NOTE(review): certificate verification is disabled here and this
        # relies on a private ssl helper. The repacked tarball is still
        # checked against the expected hash, but the commit timestamp is
        # fetched unauthenticated. Confirm whether hosts without a CA bundle
        # must remain supported before tightening this.
        sslcontext = ssl._create_unverified_context()
        fileobj = urllib.request.urlopen(req, context=sslcontext)
        return fileobj

    def _error(self, msg):
        """Return a DownloadGitHubError for ``msg`` prefixed with the source name."""
        return DownloadGitHubError('{}: {}'.format(self.source, msg))
408
409
def main():
    """Parse the command line and run the GitHub tarball download.

    On any failure the source name, the URL and the error are written to
    stderr and the process exits with status 1.
    """
    options = (
        ('--dl-dir', {'default': os.getcwd(), 'help': 'Download dir'}),
        ('--url', {'help': 'Download URL'}),
        ('--subdir', {'help': 'Source code subdir name'}),
        ('--version', {'help': 'Source code version'}),
        ('--source', {'help': 'Source tarball filename'}),
        ('--hash', {'help': "Source tarball's expected sha256sum"}),
    )
    cli = argparse.ArgumentParser()
    for flag, kwargs in options:
        cli.add_argument(flag, **kwargs)
    args = cli.parse_args()
    try:
        DownloadGitHubTarball(args).download()
    except Exception as err:
        print('{}: Download from {} failed'.format(args.source, args.url), file=sys.stderr)
        print(err, file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    main()