File size: 4,927 Bytes
e4f9cbe
 
 
 
dbed4d4
fb71af1
b708786
e4f9cbe
 
544327d
e4f9cbe
 
55dc3dd
e4f9cbe
 
cc5eabb
0dc939d
e4f9cbe
 
54369d2
e4f9cbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
544327d
 
 
 
 
 
 
e4f9cbe
544327d
e4f9cbe
8796ec1
b708786
 
86be2dc
6a8124e
 
b4ce410
 
81f7253
 
 
 
2c44166
81f7253
 
 
 
 
3a11016
81f7253
 
cc5eabb
 
 
81f7253
cc5eabb
 
81f7253
 
 
8e61415
 
81f7253
 
 
cc5eabb
 
93e1117
cc5eabb
 
 
 
 
 
 
fb71af1
 
 
 
 
7b46386
e4f9cbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""Serves the Lilac server."""

import logging
import os
import shutil
import subprocess
from typing import Any

from fastapi import APIRouter, FastAPI
from fastapi.responses import FileResponse, ORJSONResponse
from fastapi.routing import APIRoute
from fastapi.staticfiles import StaticFiles
from huggingface_hub import snapshot_download

from . import router_concept, router_data_loader, router_dataset, router_signal, router_tasks
from .concepts.db_concept import DiskConceptDB, get_concept_output_dir
from .config import CONFIG, data_path
from .router_utils import RouteErrorHandler
from .tasks import task_manager
from .utils import get_dataset_output_dir, list_datasets

# Absolute path of the built web app, resolved against the current working directory.
DIST_PATH = os.path.abspath(os.path.join('web', 'blueprint', 'build'))

# Tag descriptions passed to FastAPI as `openapi_tags`.
tags_metadata: list[dict[str, Any]] = [
  {'name': 'datasets', 'description': 'API for querying a dataset.'},
  {'name': 'concepts', 'description': 'API for managing concepts.'},
  {'name': 'data_loaders', 'description': 'API for loading data.'},
  {'name': 'signals', 'description': 'API for managing signals.'},
]


def custom_generate_unique_id(route: APIRoute) -> str:
  """Use the route's own name as the unique id of the API endpoint."""
  endpoint_name = route.name
  return endpoint_name


# The FastAPI application. Responses default to ORJSONResponse and unique route ids come from
# custom_generate_unique_id.
app = FastAPI(
  openapi_tags=tags_metadata,
  generate_unique_id_function=custom_generate_unique_id,
  default_response_class=ORJSONResponse,
)

# Every sub-router is attached to the v1 router first, in this order, and the v1 router is then
# attached to the app under /api/v1.
v1_router = APIRouter(route_class=RouteErrorHandler)
for _sub_router, _prefix, _tag in (
  (router_dataset.router, '/datasets', 'datasets'),
  (router_concept.router, '/concepts', 'concepts'),
  (router_data_loader.router, '/data_loaders', 'data_loaders'),
  (router_signal.router, '/signals', 'signals'),
  (router_tasks.router, '/tasks', 'tasks'),
):
  v1_router.include_router(_sub_router, prefix=_prefix, tags=[_tag])

app.include_router(v1_router, prefix='/api/v1')


@app.api_route('/{path_name}', include_in_schema=False)
def catch_all() -> FileResponse:
  """Serve the web app's index.html for any path not handled by an earlier route.

  This lets client-side (HTML5 history) routes resolve to the single-page app.
  """
  index_html = os.path.join(DIST_PATH, 'index.html')
  return FileResponse(path=index_html)


# In production mode the built web app under DIST_PATH is served as static files from the root.
# NOTE(review): check_dir=False presumably lets the server start when the build directory is
# missing (e.g. in development) -- confirm against the Starlette StaticFiles docs.
app.mount(
  '/',
  StaticFiles(directory=DIST_PATH, check_dir=False, html=True),
)


@app.on_event('startup')
def startup() -> None:
  """Copy datasets and concepts out of the HuggingFace space into the data directory.

  Nothing is downloaded unless the `SPACE_ID` config value is set. When it is, the space
  repository is downloaded to `<data_path>/.hf_spaces/<SPACE_ID>`, and every dataset and concept
  listed under its `data` directory is moved to its own output directory under `data_path()`,
  replacing whatever was there before.
  """
  # NOTE(review): this prints the whole config; if HF_ACCESS_TOKEN is set it ends up in the logs.
  print('env=', CONFIG)

  # SPACE_ID is the HuggingFace Space ID environment variable that is automatically set by HF.
  repo_id = CONFIG.get('SPACE_ID', None)
  if not repo_id:
    return

  # Download the huggingface space data. This includes code and datasets, so we move the datasets
  # alone to the data directory.
  spaces_download_dir = os.path.join(data_path(), '.hf_spaces', repo_id)
  snapshot_download(
    repo_id=repo_id,
    repo_type='space',
    local_dir=spaces_download_dir,
    local_dir_use_symlinks=False,
    cache_dir=os.path.join(data_path(), '.hf_cache'),
    # Read with .get() so a missing token does not abort startup with a KeyError; None is passed
    # through when no token is configured.
    token=CONFIG.get('HF_ACCESS_TOKEN', None))

  spaces_data_dir = os.path.join(spaces_download_dir, 'data')

  # Copy datasets.
  for dataset in list_datasets(spaces_data_dir):
    spaces_dataset_output_dir = get_dataset_output_dir(spaces_data_dir, dataset.namespace,
                                                       dataset.dataset_name)
    persistent_dataset_output_dir = get_dataset_output_dir(data_path(), dataset.namespace,
                                                           dataset.dataset_name)

    # Huggingface doesn't let you selectively download files so we just copy the data directory
    # out of the cloned space.
    shutil.rmtree(persistent_dataset_output_dir, ignore_errors=True)
    shutil.move(spaces_dataset_output_dir, persistent_dataset_output_dir)

  # Copy concepts.
  concepts = DiskConceptDB(spaces_data_dir).list()
  print('loading concepts', concepts, spaces_data_dir)
  for concept in concepts:
    spaces_concept_output_dir = get_concept_output_dir(spaces_data_dir, concept.namespace,
                                                       concept.name)
    # The destination must use the concept directory layout (not the dataset one), mirroring how
    # the source directory is resolved, so the moved concept lands in the concept output dir.
    persistent_concept_output_dir = get_concept_output_dir(data_path(), concept.namespace,
                                                           concept.name)
    shutil.rmtree(persistent_concept_output_dir, ignore_errors=True)
    shutil.move(spaces_concept_output_dir, persistent_concept_output_dir)


def run(cmd: str) -> subprocess.CompletedProcess[bytes]:
  """Execute `cmd` through the shell and return the completed process.

  Raises:
    subprocess.CalledProcessError: If the command exits with a non-zero status.
  """
  completed = subprocess.run(cmd, check=True, shell=True)
  return completed


@app.on_event('shutdown')
async def shutdown_event() -> None:
  """Stop the task manager as part of the FastAPI shutdown."""
  manager = task_manager()
  await manager.stop()


class GetTasksFilter(logging.Filter):
  """Logging filter that drops records mentioning the tasks endpoint."""

  def filter(self, record: logging.LogRecord) -> bool:
    """Return False for records whose message contains /api/v1/tasks/, True otherwise."""
    message = record.getMessage()
    return '/api/v1/tasks/' not in message


# Attach the filter to uvicorn's access logger so requests to /api/v1/tasks/ are not logged.
_uvicorn_access_logger = logging.getLogger('uvicorn.access')
_uvicorn_access_logger.addFilter(GetTasksFilter())