Skip to content

Models for User and DataObj

Internal API for the models Archivy uses in the backend that could be useful for writing plugins.

DataObj

Class that holds a data object (either a note or a bookmark).

[Required to pass when creating a new object]

  • type -> "note" or "bookmark"

Note:

  • title

Bookmark:

  • url

[Optional attrs that if passed, will be set by the class]

  • tags
  • content
  • path

[Handled by the code]

  • id
  • date

For bookmarks, run process_bookmark_url() once you've created it.

For both types, run insert() if you want to create a new file in the db with their contents.

Source code in archivy/models.py
@attrs(kw_only=True)
class DataObj:
    """
    Class that holds a data object (either a note or a bookmark).

    Attributes:

    [Required to pass when creating a new object]

    - **type** -> "note" or "bookmark"

    **Note**:

    - title

    **Bookmark**:

    - url

    [Optional attrs that if passed, will be set by the class]

    - tags
    - content
    - path

    [Handled by the code]

    - id
    - date

    For bookmarks,
    run `process_bookmark_url()` once you've created it.

    For both types, run `insert()` if you want to create a new file in
    the db with their contents.
    """

    __searchable__ = ["title", "content", "tags"]

    id: Optional[int] = attrib(validator=optional(instance_of(int)), default=None)
    type: str = attrib(validator=instance_of(str))
    title: str = attrib(validator=instance_of(str), default="")
    content: str = attrib(validator=instance_of(str), default="")
    # A factory gives every instance its own list; a literal `[]` default
    # would be shared (and mutated) across all instances.
    tags: List[str] = attrib(validator=instance_of(list), factory=list)
    url: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)
    date: Optional[datetime] = attrib(
        validator=optional(instance_of(datetime)), default=None
    )
    modified_at: Optional[datetime] = attrib(
        validator=optional(instance_of(datetime)), default=None
    )
    path: str = attrib(validator=instance_of(str), default="")
    fullpath: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)
    error: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)

    def process_bookmark_url(self, raw_html=None):
        """
        Process url to get content for bookmark.

        Does nothing unless the object is a bookmark with a valid url.
        Otherwise the page is downloaded (or `raw_html` is used when it is
        passed and non-empty), and `title` and `content` are filled in.
        On failure `error` is set to a short message.

        Entries of the `SCRAPING_PATTERNS` config whose pattern matches the
        url customise the behavior: a string is used as a css selector, any
        other value is called with the dataobj and replaces the default
        processing entirely.
        """
        if self.type not in ("bookmark", "pocket_bookmark") or not validators.url(
            self.url
        ):
            return None

        selector = None
        for pattern, handler in current_app.config["SCRAPING_PATTERNS"].items():
            if not fnmatch.fnmatch(self.url, pattern):
                continue
            if isinstance(handler, str):
                # if the handler is a string, it's simply a css selector to process the page with
                selector = handler
                break
            # otherwise custom user function that overrides archivy behavior
            handler(self)
            return None

        try:
            page_html = (
                raw_html
                or requests.get(
                    self.url,
                    headers={"User-agent": f"Archivy/v{require('archivy')[0].version}"},
                ).text
            )
        except Exception:
            self.error = f"Could not retrieve {self.url}\n"
            self.wipe()
            return None

        try:
            document = Document(page_html)
            self.title = document.short_title() or self.url
            parsed_html = BeautifulSoup(document.summary(), features="html.parser")
        except Exception:
            self.error = f"Could not parse {self.url}\n"
            self.wipe()
            return None

        try:
            self.content = self.extract_content(parsed_html, selector)
        except Exception:
            # the title found above is kept; only the content is missing
            self.error = f"Could not extract content from {self.url}\n"
        return None

    def wipe(self):
        """Resets and invalidates dataobj (an empty title fails `validate()`)"""
        self.title = ""
        self.content = ""

    def extract_content(self, beautsoup, selector=None):
        """
        Converts html bookmark url to optimized markdown and saves images.

        `beautsoup` is the parsed page. When `selector` is given and matches,
        only its first match is converted. Returns the markdown string.
        """
        # Resolve against the url exactly as given: stripping a trailing slash
        # would make urljoin drop the last path segment for relative paths.
        base_url = self.url

        if selector:
            matches = beautsoup.select(selector)
            # if the custom selector matched, take the first occurrence
            if matches:
                beautsoup = matches[0]

        for tag in beautsoup.find_all(["a", "img"]):
            if tag.name == "a":
                if tag.has_attr("href") and tag["href"].startswith("/"):
                    tag["href"] = urljoin(base_url, tag["href"])

                # check it's a normal link and not some sort of image
                # string returns the text content of the tag
                if not tag.string:
                    tag.decompose()

            elif tag.name == "img" and tag.has_attr("src"):
                # last path component, without query parameters
                filename = tag["src"].split("/")[-1].split("?", 1)[0]
                if not tag["src"].startswith("http"):
                    tag["src"] = urljoin(base_url, tag["src"])
                if current_app.config["SCRAPING_CONF"][
                    "save_images"
                ] and valid_image_filename(filename):
                    image = FileStorage(
                        BytesIO(requests.get(tag["src"]).content), filename, name="file"
                    )
                    saved_to = save_image(image)
                    tag["src"] = "/images/" + saved_to

        return html2text(str(beautsoup), bodywidth=0)

    def validate(self):
        """
        Verifies that the content matches required validation constraints.

        Every object needs a non-empty string title. Bookmarks additionally
        need a valid url and string content. Returns a bool.
        """
        is_bookmark = self.type in ("bookmark", "pocket_bookmark")
        # The url is only checked for bookmarks; the isinstance test runs first
        # so the url validator is never handed a non-string.
        valid_url = not is_bookmark or (
            isinstance(self.url, str) and bool(validators.url(self.url))
        )
        valid_title = isinstance(self.title, str) and self.title != ""
        valid_content = not is_bookmark or isinstance(self.content, str)
        return valid_url and valid_title and valid_content

    def insert(self):
        """
        Creates a new file with the object's attributes.

        Returns the new id, or False when `validate()` fails (in which case
        nothing is written).
        """
        if not self.validate():
            return False

        for tag in self.tags:
            add_tag_to_index(tag)
        helpers.set_max_id(helpers.get_max_id() + 1)
        self.id = helpers.get_max_id()
        self.date = datetime.now()

        hooks = current_app.config["HOOKS"]

        # runs before the metadata is built, so the hook's changes are saved
        hooks.before_dataobj_create(self)
        data = {
            "type": self.type,
            "title": str(self.title),
            "date": self.date.strftime("%x").replace("/", "-"),
            "modified_at": self.date.strftime("%x %H:%M"),
            "tags": self.tags,
            "id": self.id,
            "path": self.path,
        }
        if self.type in ("bookmark", "pocket_bookmark"):
            data["url"] = self.url

        # convert to markdown file
        dataobj = frontmatter.Post(self.content)
        dataobj.metadata = data
        self.fullpath = str(
            create(
                frontmatter.dumps(dataobj),
                f"{self.id}-{dataobj['title']}",
                path=self.path,
            )
        )

        hooks.on_dataobj_create(self)
        self.index()
        return self.id

    def index(self):
        """Adds the object to the index and returns the result of `add_to_index`"""
        return add_to_index(self)

    @classmethod
    def from_md(cls, md_content: str):
        """
        Class method to generate new dataobj from a well formatted markdown string

        Call like this:

        ```python
        DataObj.from_md(content)

        ```

        A missing `date` or `modified_at` falls back to January 1st 1970.
        """
        data = frontmatter.loads(md_content)
        dataobj = {}
        dataobj["content"] = data.content
        for pair in ["id", "title", "path", "tags"]:
            try:
                dataobj[pair] = data[pair]
            except KeyError:
                # files sometimes get moved temporarily by applications while you edit
                # this can create bugs where the data is not loaded correctly
                # this handles that scenario as validation will simply fail and the event will
                # be ignored
                break

        # Use a datetime for the fallback instead of parsing a default string:
        # "%x" is locale dependent, and a four-digit-year default does not
        # parse when "%x" expects a two-digit year.
        epoch = datetime(1970, 1, 1)
        raw_date = data.get("date")
        raw_modified = data.get("modified_at")
        dataobj["date"] = (
            datetime.strptime(raw_date.replace("-", "/"), "%x")
            if raw_date is not None
            else epoch
        )
        dataobj["modified_at"] = (
            datetime.strptime(raw_modified, "%x %H:%M")
            if raw_modified is not None
            else epoch
        )
        dataobj["type"] = "processed-dataobj"
        return cls(**dataobj)

extract_content(self, beautsoup, selector=None)

converts html bookmark url to optimized markdown and saves images

Source code in archivy/models.py
def extract_content(self, beautsoup, selector=None):
    """
    Converts html bookmark url to optimized markdown and saves images.

    `beautsoup` is the parsed page. When `selector` is given and matches,
    only its first match is converted. Returns the markdown string.
    """
    # Resolve against the url exactly as given: stripping a trailing slash
    # would make urljoin drop the last path segment for relative paths.
    base_url = self.url

    if selector:
        matches = beautsoup.select(selector)
        # if the custom selector matched, take the first occurrence
        if matches:
            beautsoup = matches[0]

    for tag in beautsoup.find_all(["a", "img"]):
        if tag.name == "a":
            if tag.has_attr("href") and tag["href"].startswith("/"):
                tag["href"] = urljoin(base_url, tag["href"])

            # check it's a normal link and not some sort of image
            # string returns the text content of the tag
            if not tag.string:
                tag.decompose()

        elif tag.name == "img" and tag.has_attr("src"):
            # last path component, without query parameters
            filename = tag["src"].split("/")[-1].split("?", 1)[0]
            if not tag["src"].startswith("http"):
                tag["src"] = urljoin(base_url, tag["src"])
            if current_app.config["SCRAPING_CONF"][
                "save_images"
            ] and valid_image_filename(filename):
                image = FileStorage(
                    BytesIO(requests.get(tag["src"]).content), filename, name="file"
                )
                saved_to = save_image(image)
                tag["src"] = "/images/" + saved_to

    return html2text(str(beautsoup), bodywidth=0)

from_md(md_content) classmethod

Class method to generate new dataobj from a well formatted markdown string

Call like this:

DataObj.from_md(content)
Source code in archivy/models.py
@classmethod
def from_md(cls, md_content: str):
    """
    Class method to generate new dataobj from a well formatted markdown string

    Call like this:

    ```python
    DataObj.from_md(content)

    ```

    A missing `date` or `modified_at` falls back to January 1st 1970.
    """
    data = frontmatter.loads(md_content)
    dataobj = {}
    dataobj["content"] = data.content
    for pair in ["id", "title", "path", "tags"]:
        try:
            dataobj[pair] = data[pair]
        except KeyError:
            # files sometimes get moved temporarily by applications while you edit
            # this can create bugs where the data is not loaded correctly
            # this handles that scenario as validation will simply fail and the event will
            # be ignored
            break

    # Use a datetime for the fallback instead of parsing a default string:
    # "%x" is locale dependent, and a four-digit-year default does not
    # parse when "%x" expects a two-digit year.
    epoch = datetime(1970, 1, 1)
    raw_date = data.get("date")
    raw_modified = data.get("modified_at")
    dataobj["date"] = (
        datetime.strptime(raw_date.replace("-", "/"), "%x")
        if raw_date is not None
        else epoch
    )
    dataobj["modified_at"] = (
        datetime.strptime(raw_modified, "%x %H:%M")
        if raw_modified is not None
        else epoch
    )
    dataobj["type"] = "processed-dataobj"
    return cls(**dataobj)

insert(self)

Creates a new file with the object's attributes

Source code in archivy/models.py
def insert(self):
    """
    Write the object to a new markdown file and index it.

    Returns the freshly assigned id, or False when `validate()` rejects the
    object (nothing is written in that case).
    """
    if not self.validate():
        return False

    for tag in self.tags:
        add_tag_to_index(tag)

    # reserve the next id, then stamp the object with it and the current time
    helpers.set_max_id(helpers.get_max_id() + 1)
    self.id = helpers.get_max_id()
    self.date = datetime.now()

    hooks = current_app.config["HOOKS"]
    # the hook runs before the metadata is collected, so its changes are saved
    hooks.before_dataobj_create(self)

    metadata = {
        "type": self.type,
        "title": str(self.title),
        "date": self.date.strftime("%x").replace("/", "-"),
        "modified_at": self.date.strftime("%x %H:%M"),
        "tags": self.tags,
        "id": self.id,
        "path": self.path,
    }
    if self.type in ("bookmark", "pocket_bookmark"):
        metadata["url"] = self.url

    # front matter + content -> markdown document
    post = frontmatter.Post(self.content)
    post.metadata = metadata
    filename = f"{self.id}-{post['title']}"
    self.fullpath = str(create(frontmatter.dumps(post), filename, path=self.path))

    hooks.on_dataobj_create(self)
    self.index()
    return self.id

process_bookmark_url(self, raw_html=None)

Process url to get content for bookmark

Source code in archivy/models.py
def process_bookmark_url(self, raw_html=None):
    """
    Process url to get content for bookmark.

    Does nothing unless the object is a bookmark with a valid url.
    Otherwise the page is downloaded (or `raw_html` is used when it is
    passed and non-empty), and `title` and `content` are filled in.
    On failure `error` is set to a short message.

    Entries of the `SCRAPING_PATTERNS` config whose pattern matches the
    url customise the behavior: a string is used as a css selector, any
    other value is called with the dataobj and replaces the default
    processing entirely.
    """
    if self.type not in ("bookmark", "pocket_bookmark") or not validators.url(
        self.url
    ):
        return None

    selector = None
    for pattern, handler in current_app.config["SCRAPING_PATTERNS"].items():
        if not fnmatch.fnmatch(self.url, pattern):
            continue
        if isinstance(handler, str):
            # if the handler is a string, it's simply a css selector to process the page with
            selector = handler
            break
        # otherwise custom user function that overrides archivy behavior
        handler(self)
        return None

    try:
        page_html = (
            raw_html
            or requests.get(
                self.url,
                headers={"User-agent": f"Archivy/v{require('archivy')[0].version}"},
            ).text
        )
    except Exception:
        self.error = f"Could not retrieve {self.url}\n"
        self.wipe()
        return None

    try:
        document = Document(page_html)
        self.title = document.short_title() or self.url
        parsed_html = BeautifulSoup(document.summary(), features="html.parser")
    except Exception:
        self.error = f"Could not parse {self.url}\n"
        self.wipe()
        return None

    try:
        self.content = self.extract_content(parsed_html, selector)
    except Exception:
        # the title found above is kept; only the content is missing
        self.error = f"Could not extract content from {self.url}\n"
    return None

validate(self)

Verifies that the content matches required validation constraints

Source code in archivy/models.py
def validate(self):
    """
    Verifies that the content matches required validation constraints.

    Every object needs a non-empty string title. Bookmarks additionally
    need a valid url and string content. Returns a bool.
    """
    is_bookmark = self.type in ("bookmark", "pocket_bookmark")
    # The url is only checked for bookmarks; the isinstance test runs first
    # so the url validator is never handed a non-string.
    valid_url = not is_bookmark or (
        isinstance(self.url, str) and bool(validators.url(self.url))
    )
    valid_title = isinstance(self.title, str) and self.title != ""
    valid_content = not is_bookmark or isinstance(self.content, str)
    return valid_url and valid_title and valid_content

wipe(self)

Resets and invalidates dataobj

Source code in archivy/models.py
def wipe(self):
    """Blank out the title and content, leaving every other attribute untouched."""
    self.title = self.content = ""

User (UserMixin)

Model we use for User that inherits from flask login's UserMixin

  • username
  • password
  • is_admin
Source code in archivy/models.py
@attrs(kw_only=True)
class User(UserMixin):
    """
    Model we use for User that inherits from flask login's
    [`UserMixin`](https://flask-login.readthedocs.io/en/latest/#flask_login.UserMixin)

    Attributes:

    - **username**
    - **password**
    - **is_admin**
    """

    username: str = attrib(validator=instance_of(str))
    password: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)
    is_admin: Optional[bool] = attrib(
        validator=optional(instance_of(bool)), default=None
    )
    # NOTE(review): the default is False rather than None although the field
    # is typed Optional[int] - kept as is, confirm whether callers rely on it.
    id: Optional[int] = attrib(validator=optional(instance_of(int)), default=False)

    def insert(self):
        """
        Inserts the model into the database.

        Returns False when no password is set or when a user with the same
        username already exists; otherwise returns the result of the
        database insert.
        """
        if not self.password:
            return False

        hashed_password = generate_password_hash(self.password)
        db = helpers.get_db()

        same_username = (Query().type == "user") & (Query().username == self.username)
        if db.search(same_username):
            return False

        # only the hash is stored, never the plain password
        record = {
            "username": self.username,
            "hashed_password": hashed_password,
            "is_admin": self.is_admin,
            "type": "user",
        }
        current_app.config["HOOKS"].on_user_create(self)
        return db.insert(record)

    @classmethod
    def from_db(cls, db_object):
        """
        Takes a database object and turns it into a user.

        Only the username and the document id are carried over.
        """
        return cls(username=db_object["username"], id=db_object.doc_id)

from_db(db_object) classmethod

Takes a database object and turns it into a user

Source code in archivy/models.py
@classmethod
def from_db(cls, db_object):
    """
    Takes a database object and turns it into a user.

    Only the username and the document id are carried over.
    """
    return cls(username=db_object["username"], id=db_object.doc_id)

insert(self)

Inserts the model into the database

Source code in archivy/models.py
def insert(self):
    """
    Inserts the model into the database.

    Returns False when no password is set or when a user with the same
    username already exists; otherwise returns the result of the
    database insert.
    """
    if not self.password:
        return False

    hashed_password = generate_password_hash(self.password)
    db = helpers.get_db()

    same_username = (Query().type == "user") & (Query().username == self.username)
    if db.search(same_username):
        return False

    # only the hash is stored, never the plain password
    record = {
        "username": self.username,
        "hashed_password": hashed_password,
        "is_admin": self.is_admin,
        "type": "user",
    }
    current_app.config["HOOKS"].on_user_create(self)
    return db.insert(record)