Source code for basilica

from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import base64
import requests
import io
from PIL import Image

__version__ = '0.2.5'

[docs]class Connection(object): def __init__(self, auth_key, server='https://api.basilica.ai', retries=2, backoff_factor=0.1, status_forcelist=(500)): """A connection to basilica.ai that can be used to generate embeddings. :param auth_key: Your auth key. You can view your auth keys at https://basilica.ai/auth_keys. :type auth_key: str :param server: What URL to use to connect to the server. :type server: str :param retries: Number of times to retry failed connections and requests. :type retries: int :param backoff_factor: See urllib3.util.retry.Retry.backoff_factor . :type backoff_factor: float :param status_forcelist: What HTTP response codes trigger a retry. :type status_forcelist: Tuple[int] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... print(c.embed_sentence('A sentence.')) [0.6246702671051025, ..., -0.03025037609040737] """ self.server = server self.session = requests.Session() self.session.auth = (auth_key, '') self.retry = Retry( total=retries, read=retries, connect=retries, backoff_factor=backoff_factor, status_forcelist=status_forcelist, ) self.adapter = HTTPAdapter(max_retries=self.retry) self.session.mount('http://', self.adapter) self.session.mount('https://', self.adapter) def __enter__(self, *a, **kw): self.session.__enter__(*a, **kw) return self def __exit__(self, *a, **kw): return self.session.__exit__(*a, **kw) def raw_embed(self, url, data, opts, timeout): if type(url) != str: raise ValueError('`url` argument must be a string (got `%s`)' % url) if type(opts) != dict: raise ValueError('`url` argument must be a dict (got `%s`)' % url) if 'data' in opts: raise ValueError('`opts` argument may not contain `data` key (got `%s`)' % opts) query = opts.copy() query['data'] = data # For some reason the requests library doesn't retry timeouts # on its own. We don't bother with backoff. for i in range(self.retry.read+1): try: headers = { 'User-Agent': 'Basilica Python Client (%s)' % __version__ } res = self.session.post(url, json=query, timeout=timeout, headers=headers) except requests.exceptions.Timeout: if i < self.retry.read: continue else: raise break res.raise_for_status() out = res.json() if 'error' in out: raise RuntimeError('basilica.ai server returned error: `%s`' % out['error']) if 'embeddings' not in out: raise RuntimeError('basilica.ai server did not return embeddings: `%s`' % out) return out['embeddings'] # TODO: parallelize def embed(self, url, data, batch_size, opts, timeout): batch = [] for i in data: batch.append(i) if len(batch) >= batch_size: for e in self.raw_embed(url, batch, opts=opts, timeout=timeout): yield e batch = [] if len(batch) > 0: for e in self.raw_embed(url, batch, opts=opts, timeout=timeout): yield e batch = []
[docs] def embed_images(self, images, model='generic', version='default', batch_size=32, opts={}, timeout=30): """Generate embeddings for JPEG images. Images should be passed as byte strings, and will be sent to the server in batches to be embedded. :param images: An iterable (such as a list) of the images to embed. :type images: Iterable[str] :param model: What model to use (i.e. the kind of image being embedded). :type model: str :param version: What version of that model to use. :type version: str :param batch_size: How many instances to send to the server at a time. :type batch_size: int :param opts: Options specific to the model/version you chose. :type opts: Dict[str, Any] :param opts["dimensions"]: Number of dimensions to return. PCA will be used to reduce the number of dimensions with minimal information loss. :type opts["dimensions"]: int :param opts["normalize_l2"]: Whether or not each instance should be scaled to have unit L2 norm. (This is sometimes useful for instance retrieval tasks.) Defaults to False. :type opts["normalize_l2"]: bool :param opts["normalize_mean"]: Whether or not to normalize each feature in the embedding to have mean 0 across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_mean"]: bool :param opts["normalize_variance"]: Whether or not to normalize each feature in the embedding to have unit variance across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_variance"]: bool :param timeout: HTTP timeout for request. :type timeout: int :returns: A generator of embeddings. :rtype: Generator[List[float]] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... images = [] ... for filename in ['img1.jpg', 'img2.jpg']: ... with open(filename, 'rb') as f: ... images.append(f.read()) ... for embedding in c.embed_images(images): ... print(embedding) [0.6246702671051025, ...] [-0.03025037609040737, ...] """ url = '%s/embed/images/%s/%s' % (self.server, model, version) data = ({'img': self.__encode_image(img, transform_image=opts.get("transform_image", True) )} for img in images) return self.embed(url, data, batch_size=batch_size, opts=opts, timeout=timeout)
[docs] def embed_image(self, image, model='generic', version='default', opts={}, timeout=10): """Generate the embedding for a JPEG image. The image should be passed as a byte string. :param image: The image to embed. :type image: str :param model: What model to use (i.e. the kind of image being embedded). :type model: str :param version: What version of that model to use. :type version: str :param opts: Options specific to the model/version you chose. :type opts: Dict[str, Any] :param opts["dimensions"]: Number of dimensions to return. PCA will be used to reduce the number of dimensions with minimal information loss. :type opts["dimensions"]: int :param opts["normalize_l2"]: Whether or not each instance should be scaled to have unit L2 norm. (This is sometimes useful for instance retrieval tasks.) Defaults to False. :type opts["normalize_l2"]: bool :param opts["normalize_mean"]: Whether or not to normalize each feature in the embedding to have mean 0 across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_mean"]: bool :param opts["normalize_variance"]: Whether or not to normalize each feature in the embedding to have unit variance across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_variance"]: bool :param timeout: HTTP timeout for request. :type timeout: int :returns: An embedding. :rtype: List[float] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... with open('img.jpg', 'rb') as f: ... print(c.embed_image(f.read())) [0.6246702671051025, ...] """ return list(self.embed_images([image], model=model, version=version, opts=opts, timeout=timeout))[0]
[docs] def embed_image_files(self, image_files, model='generic', version='default', batch_size=32, opts={}, timeout=30): """Generate embeddings for JPEG image files. The file names should be passed as paths that can be understood by `open`. :param image_files: An iterable (such as a list) of paths to the images to embed. :type image_files: Iterable[str] :param model: What model to use (i.e. the kind of image being embedded). :type model: str :param version: What version of that model to use. :type version: str :param batch_size: How many instances to send to the server at a time. :type batch_size: int :param opts: Options specific to the model/version you chose. :type opts: Dict[str, Any] :param opts["dimensions"]: Number of dimensions to return. PCA will be used to reduce the number of dimensions with minimal information loss. :type opts["dimensions"]: int :param opts["normalize_l2"]: Whether or not each instance should be scaled to have unit L2 norm. (This is sometimes useful for instance retrieval tasks.) Defaults to False. :type opts["normalize_l2"]: bool :param opts["normalize_mean"]: Whether or not to normalize each feature in the embedding to have mean 0 across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_mean"]: bool :param opts["normalize_variance"]: Whether or not to normalize each feature in the embedding to have unit variance across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_variance"]: bool :param timeout: HTTP timeout for request. :type timeout: int :returns: A generator of embeddings. :rtype: Generator[List[float]] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... for embedding in c.embed_image_files(['img1.jpg', 'img2.jpg']): ... print(embedding) [0.6246702671051025, ...] [-0.03025037609040737, ...] """ def load_image_files(image_files): for image_file in image_files: with open(image_file, 'rb') as f: yield f.read() return self.embed_images(load_image_files(image_files), model=model, version=version, batch_size=batch_size, opts=opts, timeout=timeout)
[docs] def embed_image_file(self, image_file, model='generic', version='default', opts={}, timeout=10): """Generate the embedding for a JPEG image file. The file name should be passed as a path that can be understood by `open`. :param image_file: Path to the image to embed. :type image_file: str :param model: What model to use (i.e. the kind of image being embedded). :type model: str :param version: What version of that model to use. :type version: str :param opts: Options specific to the model/version you chose. :type opts: Dict[str, Any] :param opts["dimensions"]: Number of dimensions to return. PCA will be used to reduce the number of dimensions with minimal information loss. :type opts["dimensions"]: int :param opts["normalize_l2"]: Whether or not each instance should be scaled to have unit L2 norm. (This is sometimes useful for instance retrieval tasks.) Defaults to False. :type opts["normalize_l2"]: bool :param opts["normalize_mean"]: Whether or not to normalize each feature in the embedding to have mean 0 across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_mean"]: bool :param opts["normalize_variance"]: Whether or not to normalize each feature in the embedding to have unit variance across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_variance"]: bool :param timeout: HTTP timeout for request. :type timeout: int :returns: An embedding. :rtype: List[float] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... print(c.embed_image_file('img.jpg') [0.6246702671051025, ...] """ with open(image_file, 'rb') as f: return self.embed_image(f.read(), model=model, version=version, opts=opts, timeout=timeout)
[docs] def embed_sentences(self, sentences, model='english', version='default', batch_size=64, opts={}, timeout=15): """Generate embeddings for sentences. :param sentences: An iterable (such as a list) of sentences to embed. :type sentences: Iterable[str] :param model: What model to use (i.e. the kind of sentence being embedded). * **generic:** Generic English text embedding (the default.) * **reddit:** Text embedding specialized for English Reddit posts. * **twitter:** Text embedding specialized for English tweets. * **email:** Text embedding specialized for English emails. * **product-reviews:** Text embedding specialized for English product reviews. :type model: str :param version: What version of that model to use. :type version: str :param batch_size: How many instances to send to the server at a time. :type batch_size: int :param opts: Options specific to the model/version you chose. :type opts: Dict[str, Any] :param opts["dimensions"]: Number of dimensions to return. PCA will be used to reduce the number of dimensions with minimal information loss. :type opts["dimensions"]: int :param opts["normalize_l2"]: Whether or not each instance should be scaled to have unit L2 norm. (This is sometimes useful for instance retrieval tasks.) Defaults to False. :type opts["normalize_l2"]: bool :param opts["normalize_mean"]: Whether or not to normalize each feature in the embedding to have mean 0 across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_mean"]: bool :param opts["normalize_variance"]: Whether or not to normalize each feature in the embedding to have unit variance across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_variance"]: bool :param timeout: HTTP timeout for request. :type timeout: int :returns: A generator of embeddings. :rtype: Generator[List[float]] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... for embedding in c.embed_sentences(['Sentence one.', 'Sentence two.']): ... print(embedding) [0.6246702671051025, ...] [-0.03025037609040737, ...] """ url = '%s/embed/text/%s/%s' % (self.server, model, version) data = sentences return self.embed(url, data, batch_size=batch_size, opts=opts, timeout=timeout)
[docs] def embed_sentence(self, sentence, model='english', version='default', opts={}, timeout=5): """Generate the embedding for a sentence. :param sentence: The sentence to embed. :type sentence: str :param model: What model to use (i.e. the kind of sentence being embedded). * **generic:** Generic English text embedding (the default.) * **reddit:** Text embedding specialized for English Reddit posts. * **twitter:** Text embedding specialized for English tweets. * **email:** Text embedding specialized for English emails. * **product-reviews:** Text embedding specialized for English product reviews. :type model: str :param version: What version of that model to use. :type version: str :param opts: Options specific to the model/version you chose. :type opts: Dict[str, Any] :param opts["dimensions"]: Number of dimensions to return. PCA will be used to reduce the number of dimensions with minimal information loss. :type opts["dimensions"]: int :param opts["normalize_l2"]: Whether or not each instance should be scaled to have unit L2 norm. (This is sometimes useful for instance retrieval tasks.) Defaults to False. :type opts["normalize_l2"]: bool :param opts["normalize_mean"]: Whether or not to normalize each feature in the embedding to have mean 0 across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_mean"]: bool :param opts["normalize_variance"]: Whether or not to normalize each feature in the embedding to have unit variance across our sample dataset. Defaults to True when `dimensions` is set, or False otherwise. :type opts["normalize_variance"]: bool :param timeout: HTTP timeout for request. :type timeout: int :returns: An embedding. :rtype: List[float] >>> with basilica.Connection('SLOW_DEMO_KEY') as c: ... print(c.embed_sentence('This is a sentence.') [0.6246702671051025, ...] """ return list(self.embed_sentences([sentence], model=model, version=version, opts=opts, timeout=timeout))[0]
def __encode_image(self, image, transform_image): if type(image) != bytes: raise TypeError('`image` argument must be bytes (got `%s`)' % (type(image).__name__)) if transform_image: try: im = Image.open(io.BytesIO(image)) except OSError as e: raise TypeError('`image` argument must be an image (`%s`)' % (str(e))) im.thumbnail((512, 512)) im = im.convert("RGB") img_bytes = io.BytesIO() im.save(img_bytes, "JPEG") image = img_bytes.getvalue() return base64.b64encode(image).decode('utf-8')