Spaces:

GGSheng
/

ai

Running

App Files Files Community

ai / bt-source /panel /mod /base /pubsuffix /__init__.py

GGSheng

feat: deploy Gemma 4 to hf space

17e971c verified 14 days ago

raw

history blame contribute delete

15.6 kB

	# -- coding: utf-8 --
	#
	# Copyright (c) 2019 nexB Inc. and Renée Burton
	# Copyright (c) 2015 nexB Inc.
	# This code is based on Tomaž Šolc's fork of David Wilson's code originally at
	# https://www.tablix.org/~avian/git/publicsuffix.git
	#
	# Copyright (c) 2014 Tomaž Šolc <tomaz.solc@tablix.org>
	#
	# David Wilson's code was originally at:
	# from http://code.google.com/p/python-public-suffix-list/
	#
	# Copyright (c) 2009 David Wilson
	#
	# Permission is hereby granted, free of charge, to any person obtaining a
	# copy of this software and associated documentation files (the "Software"),
	# to deal in the Software without restriction, including without limitation
	# the rights to use, copy, modify, merge, publish, distribute, sublicense,
	# and/or sell copies of the Software, and to permit persons to whom the
	# Software is furnished to do so, subject to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in
	# all copies or substantial portions of the Software.
	#
	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
	# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
	# DEALINGS IN THE SOFTWARE.
	#
	# The Public Suffix List vendored in this distribution has been downloaded
	# from http://publicsuffix.org/public_suffix_list.dat
	# This data file is licensed under the MPL-2.0 license.
	# http://mozilla.org/MPL/2.0/

	"""
	Public Suffix List module for Python.
	"""

	from __future__ import absolute_import
	from __future__ import unicode_literals

	import codecs
	from os import path
	import warnings

	try:
	from urllib.request import urlopen, Request
	except ImportError:
	try:
	from urllib2 import urlopen, Request
	except:
	pass


	PSL_URL = 'https://publicsuffix.org/list/public_suffix_list.dat'

	BASE_DIR = path.dirname(__file__)
	PSL_FILE = path.join(BASE_DIR, 'public_suffix_list.dat')



	class PublicSuffixList(object):

	def __init__(self, psl_file=None, idna=True):
	"""
	Read and parse a public suffix list. `psl_file` is either a file
	location string, or a file-like object, or an iterable of lines from a
	public suffix data file.

	If psl_file is None, the vendored file named "public_suffix_list.dat" is
	loaded. It is stored side by side with this Python package.

	The Mozilla public suffix list is no longer IDNA-encoded, it is UTF-8.
	For use cases with domains that are IDNA encoded, choose idna=True and
	the list will be converted upon loading. The wrong encoding will provide
	incorrect answers in either use case.

	The file format is described at http://publicsuffix.org/

	:param psl_file: string or None
	:param idna: boolean, whether to convert file to IDNA-encoded strings
	"""
	# Note: we test for None as we accept empty lists as inputs
	if psl_file is None or isinstance(psl_file, str):
	with codecs.open(psl_file or PSL_FILE, 'r', encoding='utf8') as psl:
	psl = psl.readlines()
	else:
	# assume file-like
	psl = psl_file

	# a list of eTLDs with their modifiers, e.g., *
	self.tlds = []
	root = self._build_structure(psl, idna)
	self.root = self._simplify(root)

	def _find_node(self, parent, parts):
	"""
	Processing each line of the public suffix list recursively to build the
	Trie. Each line is processed into a dictionary, which may contain sub-
	Trie, and nodes terminate in node of either 0 or 1 (negate).

	This method takes the current parent Trie, and searches it for the next
	part in the line (child). If not found, it adds a node to the Trie,
	creating a new branch with the [0]. If found, the existing sub-Trie is
	passed for the next part.

	:param parent: current Trie, form is Tuple (negate, dict of Trie)
	:param parts: list of strings
	:return: recursive search for remaining domain parts
	"""
	if not parts:
	return parent

	# this initiates the Trie from a new node as [negate, dict()]
	if len(parent) == 1:
	parent.append({})

	assert len(parent) == 2
	_negate, children = parent

	child = parts.pop()

	# if child already exists as a node, grab the sub-Trie
	child_node = children.get(child, None)

	# if it doesn't exist, creates a new node and initialized with [0]
	if not child_node:
	children[child] = child_node = [0]

	return self._find_node(child_node, parts)

	def _add_rule(self, root, rule):
	"""
	Initial setup for a line of the public suffix list. If it starts with !
	that is a negation operation. this calls the find_node() method
	recursively to build out the Trie for this rule.

	:param root: root Trie
	:param rule: string, line of public suffixlist
	:return: None
	"""
	if rule.startswith('!'):
	negate = 1
	rule = rule[1:]
	else:
	negate = 0

	parts = rule.split('.')
	self._find_node(root, parts)[0] = negate

	def _simplify(self, node):
	"""
	Condense the lines of the Trie in place.

	:param node: node in the Trie, either 0/1 or a subTrie
	:return: simplified Trie, form Tuple
	"""
	if len(node) == 1:
	return node[0]

	return (node[0], dict((k, self._simplify(v)) for (k, v) in node[1].items()))

	def _build_structure(self, fp, idna):
	"""
	Build a Trie from the public suffix list. If idna==True, idna-encode
	each line before building.

	The Trie is comprised of tuples that encode whether the line is a
	negation line (0 or 1), and terminate with 0. Each node is represented
	with two-tuple of the form (negate, dict of children / sub-Trie). A
	partial subTrie therefore looks like: (0, {'ac': 0, 'co': (0,
	{'blogspot': 0}), 'gv': 0,....}) where each tuple starts with the
	negation encoding, and each leaf in the Trie as a dictionary element
	returns 0.

	Also creates an instance attribute, tlds, which simply contains the
	publicsuffix list, with the modifiers such as wildcards, as a list. This
	can be accessed for post-processing by the application.

	:param fp: pointer for the public suffix list
	:param idna: boolean, convert lines to idna-encoded strings
	:return: Trie
	"""
	root = [0]

	tlds = self.tlds

	for line in fp:
	line = line.strip()
	if not line or line.startswith('//'):
	continue
	if idna:
	line = line.encode('idna').decode()
	tlds.append(line)

	self._add_rule(root, line.split()[0].lstrip('.'))

	return root

	def _lookup_node(self, matches, depth, parent, parts, wildcard):
	"""
	Traverses the Trie recursively to find the parts. By default, the
	traverse follows wildcards, as appropriate for the public suffix list,
	but if wildcard is set to False, it will stop at wildcard leaves. This
	can be useful for summarizing complex wildcard domains like those under
	amazonaws.com.

	The lookup is tracked via a list, initially set to all None, that marks
	the negation flags of nodes it matches. each match will be marked for
	later composition of the eTLD.

	:param matches: list, parts long, None (initial), 0, or 1
	:param depth: int, how far in the Trie this run is
	:param parent: Tuple, the current subTrie
	:param parts: list of domain parts, strings
	:param wildcard: boolean, whether to process wildcard nodes
	:return: None, recursive call
	"""
	if wildcard and depth == 1:
	# if no rules match, the prevailing rule is "*"
	# See: Algorithm 2 at https://publicsuffix.org/list/
	matches[-depth] = 0

	if parent in (0, 1):
	return

	children = parent[1]

	if depth <= len(parts) and children:
	for name in ('*', parts[-depth]):
	child = children.get(name, None)
	if child is not None:
	if wildcard or name != '*':
	if child in (0, 1):
	negate = child
	else:
	negate = child[0]
	matches[-depth] = negate
	self._lookup_node(matches, depth + 1, child, parts, wildcard)

	def get_sld(self, domain, wildcard=True, strict=False):
	"""
	Return the second-level-domain (SLD) or private suffix of a given domain
	according to the public suffix list. The public suffix list includes
	wildcards, so if wildcard is set to True, this will follow the wildcard
	on traversal, otherwise it will stop at wildcard nodes.

	The logic does not check by default whether the TLD is in the Trie, so
	for example, 'www.this.local' will return 'this.local'. If you want to
	ensure the TLD is in the public suffix list, use strict=True.

	If domain is already an eTLD, it returns domain as-is instead of None
	value.

	:param domain: string, needs to match the encoding of the PSL (idna or UTF8)
	:param wildcard: boolean, follow wildcard patterns
	:param strict: boolean, check the TLD is valid, return None if not
	:return: string, the SLD for the domain
	"""
	if not domain:
	return None

	# for compatibility, set strict True not to allow invalid TLDs
	tld = self.get_tld(domain, wildcard, True)
	if strict and tld is None:
	return None

	parts = domain.lower().strip('.').split('.')
	num_of_tld_parts = 0 if tld is None else tld.count('.') + 1

	if len(parts) <= num_of_tld_parts:
	return tld
	else:
	return '.'.join(parts[-(num_of_tld_parts + 1):])

	def get_public_suffix(self, domain, wildcard=True, strict=False):
	"""
	Use get_sld() instead.
	"""
	return self.get_sld(domain, wildcard, strict)

	def get_tld(self, domain, wildcard=True, strict=False):
	"""
	Return the TLD, or public suffix, of a domain using the public suffix
	list. uses wildcards if set, and checks for valid top TLD is
	strict=True.

	This will return the domain itself when it is an ICANN TLD, e.g., 'com'
	returns 'com', for follow on processing, while 'co.uk' return 'uk'. On
	the other hand, more complicated domains will return their public
	suffix, e.g.,
	'google.co.uk' will return 'co.uk'. Root ('.') will return empty string.

	:param domain: string
	:param wildcard: boolean, follow wildcards in Trie
	:param strict: boolean, check that top TLD is valid in Trie
	:return: string, the TLD for the domain
	"""
	if not domain:
	return None
	parts = domain.lower().strip('.').split('.')
	hits = [None] * len(parts)
	if strict and (
	self.root in (0, 1) or parts[-1] not in self.root[1].keys()
	):
	return None

	self._lookup_node(hits, 1, self.root, parts, wildcard)

	for i, what in enumerate(hits):
	if what is not None and what == 0:
	return '.'.join(parts[i:])


	_PSL = None


	def get_sld(domain, psl_file=None, wildcard=True, idna=True, strict=False):
	"""
	Return the private suffix or SLD for a `domain` DNS name string. The
	original publicsuffix2 library used the method get_public_suffix() for this
	purpose, but get_private_suffix() is more proper. Convenience function that
	builds and caches a PublicSuffixList object.

	Optionally read, and parse a public suffix list. `psl_file` is either a file
	location string, or a file-like object, or an iterable of lines from a
	public suffix data file.

	If psl_file is None, the vendored file named "public_suffix_list.dat" is
	loaded. It is stored side by side with this Python package.

	The file format is described at http://publicsuffix.org/
	"""
	global _PSL
	_PSL = _PSL or PublicSuffixList(psl_file, idna=idna)
	return _PSL.get_sld(domain, wildcard=wildcard, strict=strict)


	def get_tld(domain, psl_file=None, wildcard=True, idna=True, strict=False):
	"""
	Return the TLD or public suffix for a `domain` DNS name string. (this is
	actually the private suffix that is returned) Convenience function that
	builds and caches a PublicSuffixList object.

	Optionally read, and parse a public suffix list. `psl_file` is either a file
	location string, or a file-like object, or an iterable of lines from a
	public suffix data file.

	If psl_file is None, the vendored file named "public_suffix_list.dat" is
	loaded. It is stored side by side with this Python package.

	The file format is described at http://publicsuffix.org/
	"""
	global _PSL
	_PSL = _PSL or PublicSuffixList(psl_file, idna=idna)
	return _PSL.get_tld(domain, wildcard=wildcard, strict=strict)


	def get_public_suffix(domain, psl_file=None, wildcard=True, idna=True, strict=False):
	"""
	Included for compatibility with the original publicsuffix2 library -- this
	function returns the private suffix or SLD of the domain. To get the public
	suffix, use get_tld(). Convenience function that builds and caches a
	PublicSuffixList object.

	Optionally read, and parse a public suffix list. `psl_file` is either a file
	location string, or a file-like object, or an iterable of lines from a
	public suffix data file.

	If psl_file is None, the vendored file named "public_suffix_list.dat" is
	loaded. It is stored side by side with this Python package.

	The file format is described at http://publicsuffix.org/
	"""
	warnings.warn(
	'This function returns the private suffix, SLD, or registrable domain. '
	'This equivalent to function get_sld(). '
	'To get the public suffix itself, use get_tld().',
	UserWarning
	)
	return get_sld(domain, psl_file, wildcard, idna, strict)


	def fetch():
	"""
	Return a file-like object for the latest public suffix list downloaded from
	publicsuffix.org
	"""
	req = Request(PSL_URL, headers={'User-Agent': 'python-publicsuffix2'})
	res = urlopen(req)
	try:
	encoding = res.headers.get_content_charset()
	except AttributeError:
	encoding = res.headers.getparam('charset')
	f = codecs.getreader(encoding)(res)
	return f


	def update_dat():
	import os
	import time
	if not os.path.exists(PSL_FILE) or os.path.getmtime(PSL_FILE) < time.time() - 86400 * 30:
	try:
	f = fetch()
	f_dat = f.read()
	except:
	return

	if os.path.exists(PSL_FILE) and len(f_dat) < os.path.getsize(PSL_FILE) // 10:
	print('Public Suffix List is too small, not updating')

	with open(PSL_FILE, 'w') as fp:
	fp.write(f_dat)