Skip to content

Archivy

Models for User and DataObj

Models for User and DataObj

Internal API for the models Archivy uses in the backend that could be useful for writing plugins.

`DataObj`

Class that holds a data object (either a note or a bookmark).

[Required to pass when creating a new object]

type -> "note" or "bookmark"

Note:

title

Bookmark:

url

[Optional attrs that if passed, will be set by the class]

tags
content
path

[Handled by the code]

id
date

For bookmarks, run process_bookmark_url() once you've created it.

For both types, run insert() if you want to create a new file in the db with their contents.

Source code in archivy/models.py

@attrs(kw_only=True)
class DataObj:
    """
    Class that holds a data object (either a note or a bookmark).

    Attributes:

    [Required to pass when creating a new object]

    - **type** -> "note" or "bookmark"

    **Note**:

    - title

    **Bookmark**:

    - url

    [Optional attrs that if passed, will be set by the class]

    - tags
    - content
    - path

    [Handled by the code]

    - id
    - date

    For bookmarks,
    run `process_bookmark_url()` once you've created it.

    For both types, run `insert()` if you want to create a new file in
    the db with their contents.
    """

    # NOTE(review): presumably the fields picked up by the search index - confirm.
    __searchable__ = ["title", "content", "tags"]

    id: Optional[int] = attrib(validator=optional(instance_of(int)), default=None)
    type: str = attrib(validator=instance_of(str))
    title: str = attrib(validator=instance_of(str), default="")
    content: str = attrib(validator=instance_of(str), default="")
    # `factory=list` gives every instance its own list; a literal `default=[]`
    # would be one list object shared by all instances created without tags.
    tags: List[str] = attrib(validator=instance_of(list), factory=list)
    url: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)
    date: Optional[datetime] = attrib(
        validator=optional(instance_of(datetime)), default=None
    )
    modified_at: Optional[datetime] = attrib(
        validator=optional(instance_of(datetime)), default=None
    )
    path: str = attrib(validator=instance_of(str), default="")
    fullpath: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)
    error: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)

    def process_bookmark_url(self, raw_html=None):
        """
        Process url to get content for bookmark.

        Does nothing unless the object is a bookmark with a valid url.
        Otherwise fills in `title` and `content` from the page, or sets
        `error` when a step fails.

        - **raw_html** -> optional html to use instead of fetching `self.url`

        Always returns `None`.
        """
        if self.type not in ("bookmark", "pocket_bookmark") or not validators.url(
            self.url
        ):
            return None
        selector = None
        for pattern, handler in current_app.config["SCRAPING_PATTERNS"].items():
            if fnmatch.fnmatch(self.url, pattern):
                if isinstance(handler, str):
                    # if the handler is a string, it's simply a css selector to process the page with
                    selector = handler
                    break
                # otherwise custom user function that overrides archivy behavior
                handler(self)
                return None

        try:
            page_html = (
                raw_html
                or requests.get(
                    self.url,
                    headers={"User-agent": f"Archivy/v{require('archivy')[0].version}"},
                ).text
            )
        except Exception:
            self.error = f"Could not retrieve {self.url}\n"
            self.wipe()
            return None

        try:
            document = Document(page_html)
            # fall back on the url when no title could be found
            self.title = document.short_title() or self.url
            parsed_html = BeautifulSoup(document.summary(), features="html.parser")
        except Exception:
            self.error = f"Could not parse {self.url}\n"
            self.wipe()
            return None

        try:
            self.content = self.extract_content(parsed_html, selector)
        except Exception:
            # the title found above is kept here: the object is not wiped
            self.error = f"Could not extract content from {self.url}\n"
        return None

    def wipe(self):
        """Resets and invalidates dataobj (an empty title fails `validate()`)"""
        self.title = ""
        self.content = ""

    def extract_content(self, beautsoup, selector=None):
        """
        Converts html bookmark url to optimized markdown and saves images.

        - **beautsoup** -> parsed html of the page, modified in place
        - **selector** -> optional css selector; when it matches, only its
          first match is converted

        Returns the markdown as a string.
        """

        url = self.url.rstrip("/")

        if selector:
            selected_soup = beautsoup.select(selector)
            # if the custom selector matched, take the first occurrence
            if selected_soup:
                beautsoup = selected_soup[0]
        resources = beautsoup.find_all(["a", "img"])
        for tag in resources:
            if tag.name == "a":
                # make site-relative links absolute
                if tag.has_attr("href") and (tag["href"].startswith("/")):
                    tag["href"] = urljoin(url, tag["href"])

                # check it's a normal link and not some sort of image
                # string returns the text content of the tag
                if not tag.string:
                    # delete tag
                    tag.decompose()

            elif tag.name == "img" and tag.has_attr("src"):
                filename = tag["src"].split("/")[-1]
                try:
                    filename = filename[
                        : filename.index("?")
                    ]  # remove query parameters
                except ValueError:
                    pass
                if not tag["src"].startswith("http"):
                    tag["src"] = urljoin(url, tag["src"])
                if current_app.config["SCRAPING_CONF"][
                    "save_images"
                ] and valid_image_filename(filename):
                    # download the image and point the tag at the saved copy
                    image = FileStorage(
                        BytesIO(requests.get(tag["src"]).content), filename, name="file"
                    )
                    saved_to = save_image(image)
                    tag["src"] = "/images/" + saved_to

        res = html2text(str(beautsoup), bodywidth=0)
        return res

    def validate(self):
        """
        Verifies that the content matches required validation constraints.

        Every object needs a non-empty string title. Bookmarks
        ("bookmark" / "pocket_bookmark") additionally need a valid url and
        string content.

        Returns `True` when all constraints hold, `False` otherwise.
        """
        is_bookmark = self.type in ("bookmark", "pocket_bookmark")
        # the previous check `type != "bookmark" or type != "pocket_bookmark"`
        # was true for every type, so the url was never actually validated
        valid_url = not is_bookmark or (
            isinstance(self.url, str) and bool(validators.url(self.url))
        )

        valid_title = isinstance(self.title, str) and self.title != ""
        valid_content = not is_bookmark or isinstance(self.content, str)
        return valid_url and valid_title and valid_content

    def insert(self):
        """
        Creates a new file with the object's attributes.

        Returns the new id on success, `False` when `validate()` fails.
        """
        if self.validate():
            for tag in self.tags:
                add_tag_to_index(tag)
            # reserve the next id, then read it back
            helpers.set_max_id(helpers.get_max_id() + 1)
            self.id = helpers.get_max_id()
            self.date = datetime.now()

            hooks = current_app.config["HOOKS"]

            hooks.before_dataobj_create(self)
            data = {
                "type": self.type,
                "title": str(self.title),
                "date": self.date.strftime("%x").replace("/", "-"),
                # a new object's modification time is its creation time
                "modified_at": self.date.strftime("%x %H:%M"),
                "tags": self.tags,
                "id": self.id,
                "path": self.path,
            }
            if self.type == "bookmark" or self.type == "pocket_bookmark":
                data["url"] = self.url

            # convert to markdown file
            dataobj = frontmatter.Post(self.content)
            dataobj.metadata = data
            self.fullpath = str(
                create(
                    frontmatter.dumps(dataobj),
                    f"{self.id}-{dataobj['title']}",
                    path=self.path,
                )
            )

            hooks.on_dataobj_create(self)
            self.index()
            return self.id
        return False

    def index(self):
        """Passes the object to `add_to_index` and returns its result"""
        return add_to_index(self)

    @classmethod
    def from_md(cls, md_content: str):
        """
        Class method to generate new dataobj from a well formatted markdown string

        Call like this:

        ```python
        DataObj.from_md(content)

        ```

        The returned object always has type "processed-dataobj". Missing
        `date` / `modified_at` metadata fall back on 1970-01-01.
        """
        data = frontmatter.loads(md_content)
        dataobj = {}
        dataobj["content"] = data.content
        for pair in ["id", "title", "path", "tags"]:
            try:
                dataobj[pair] = data[pair]
            except KeyError:
                # files sometimes get moved temporarily by applications while you edit
                # this can create bugs where the data is not loaded correctly
                # this handles that scenario as validation will simply fail and the event will
                # be ignored
                break
        dataobj["date"] = datetime.strptime(
            data.get("date", "01/01/70").replace("-", "/"), "%x"
        )
        dataobj["modified_at"] = datetime.strptime(
            data.get("modified_at", "01/01/1970 00:00"), "%x %H:%M"
        )
        dataobj["type"] = "processed-dataobj"
        return cls(**dataobj)

`extract_content(self, beautsoup, selector=None)`

converts html bookmark url to optimized markdown and saves images

Source code in archivy/models.py

def extract_content(self, beautsoup, selector=None):
    """
    Turn the parsed html of a bookmark into markdown.

    Site-relative links and image sources are made absolute against
    `self.url`, links without text are dropped, and images are saved
    locally when the scraping config asks for it. When `selector` matches,
    only its first match is converted. Returns the markdown string.
    """
    base_url = self.url.rstrip("/")

    # narrow the document down to the first selector match, if there is one
    matches = beautsoup.select(selector) if selector else None
    if matches:
        beautsoup = matches[0]

    for node in beautsoup.find_all(["a", "img"]):
        if node.name == "a":
            if node.has_attr("href") and node["href"].startswith("/"):
                node["href"] = urljoin(base_url, node["href"])

            # `.string` is the text of the tag: a link without one is not a
            # normal link (e.g. it wraps an image), so it gets removed
            if not node.string:
                node.decompose()
            continue

        if node.name != "img" or not node.has_attr("src"):
            continue

        # last path segment of the source, without any query parameters
        filename = node["src"].split("/")[-1].split("?", 1)[0]
        if not node["src"].startswith("http"):
            node["src"] = urljoin(base_url, node["src"])

        save_images = current_app.config["SCRAPING_CONF"]["save_images"]
        if save_images and valid_image_filename(filename):
            payload = BytesIO(requests.get(node["src"]).content)
            stored_as = save_image(FileStorage(payload, filename, name="file"))
            node["src"] = "/images/" + stored_as

    return html2text(str(beautsoup), bodywidth=0)

`from_md(md_content)` `classmethod`

Class method to generate new dataobj from a well formatted markdown string

Call like this:

DataObj.from_md(content)

Source code in archivy/models.py

@classmethod
def from_md(cls, md_content: str):
    """
    Build a new dataobj from a well formatted markdown string.

    Call like this:

    ```python
    DataObj.from_md(content)

    ```

    The resulting object always has type "processed-dataobj"; `date` and
    `modified_at` fall back on 1970-01-01 when absent from the metadata.
    """
    post = frontmatter.loads(md_content)
    fields = {"content": post.content}

    for key in ("id", "title", "path", "tags"):
        try:
            fields[key] = post[key]
        except KeyError:
            # Applications sometimes move a file away temporarily while it is
            # being edited, so the metadata can be incomplete. Stop reading
            # here: validation will simply fail and the event gets ignored.
            break

    raw_date = post.get("date", "01/01/70")
    fields["date"] = datetime.strptime(raw_date.replace("-", "/"), "%x")

    raw_modified = post.get("modified_at", "01/01/1970 00:00")
    fields["modified_at"] = datetime.strptime(raw_modified, "%x %H:%M")

    fields["type"] = "processed-dataobj"
    return cls(**fields)

`insert(self)`

Creates a new file with the object's attributes

Source code in archivy/models.py

def insert(self):
    """
    Write the object to a new markdown file and index it.

    Returns the id given to the object, or `False` when `validate()` fails
    (in which case nothing is written).
    """
    if not self.validate():
        return False

    for tag in self.tags:
        add_tag_to_index(tag)

    # reserve the next id, then read it back
    helpers.set_max_id(helpers.get_max_id() + 1)
    self.id = helpers.get_max_id()
    self.date = datetime.now()

    hooks = current_app.config["HOOKS"]
    hooks.before_dataobj_create(self)

    metadata = {
        "type": self.type,
        "title": str(self.title),
        "date": self.date.strftime("%x").replace("/", "-"),
        # a new object's modification time is its creation time
        "modified_at": self.date.strftime("%x %H:%M"),
        "tags": self.tags,
        "id": self.id,
        "path": self.path,
    }
    if self.type in ("bookmark", "pocket_bookmark"):
        metadata["url"] = self.url

    # front matter + content make up the markdown file
    post = frontmatter.Post(self.content)
    post.metadata = metadata
    serialized = frontmatter.dumps(post)
    filename = f"{self.id}-{post['title']}"
    self.fullpath = str(create(serialized, filename, path=self.path))

    hooks.on_dataobj_create(self)
    self.index()
    return self.id

`process_bookmark_url(self, raw_html=None)`

Process url to get content for bookmark

Source code in archivy/models.py

def process_bookmark_url(self, raw_html=None):
    """
    Process url to get content for bookmark.

    Does nothing unless the object is a bookmark ("bookmark" or
    "pocket_bookmark") with a valid url. Otherwise fills in `title` and
    `content` from the page, or sets `error` when a step fails.

    - **raw_html** -> optional html to use instead of fetching `self.url`

    Always returns `None`.
    """
    if self.type not in ("bookmark", "pocket_bookmark") or not validators.url(
        self.url
    ):
        return None
    selector = None
    for pattern, handler in current_app.config["SCRAPING_PATTERNS"].items():
        if fnmatch.fnmatch(self.url, pattern):
            # isinstance rather than an exact type comparison, so that a str
            # subclass is treated as a selector instead of being called
            if isinstance(handler, str):
                # if the handler is a string, it's simply a css selector to process the page with
                selector = handler
                break
            # otherwise custom user function that overrides archivy behavior
            handler(self)
            return None

    try:
        page_html = (
            raw_html
            or requests.get(
                self.url,
                headers={"User-agent": f"Archivy/v{require('archivy')[0].version}"},
            ).text
        )
    except Exception:
        self.error = f"Could not retrieve {self.url}\n"
        self.wipe()
        return None

    try:
        document = Document(page_html)
        # fall back on the url when no title could be found
        self.title = document.short_title() or self.url
        parsed_html = BeautifulSoup(document.summary(), features="html.parser")
    except Exception:
        self.error = f"Could not parse {self.url}\n"
        self.wipe()
        return None

    try:
        self.content = self.extract_content(parsed_html, selector)
    except Exception:
        # the title found above is kept here: the object is not wiped
        self.error = f"Could not extract content from {self.url}\n"
    return None

`validate(self)`

Verifies that the content matches required validation constraints

Source code in archivy/models.py

def validate(self):
    """
    Verifies that the content matches required validation constraints.

    Every object needs a non-empty string title. Bookmarks ("bookmark" /
    "pocket_bookmark") additionally need a valid url and string content.

    Returns `True` when all constraints hold, `False` otherwise.
    """
    is_bookmark = self.type in ("bookmark", "pocket_bookmark")
    # the previous check `type != "bookmark" or type != "pocket_bookmark"` was
    # true for every type, so the url was never actually validated
    valid_url = not is_bookmark or (
        isinstance(self.url, str) and bool(validators.url(self.url))
    )

    valid_title = isinstance(self.title, str) and self.title != ""
    valid_content = not is_bookmark or isinstance(self.content, str)
    return valid_url and valid_title and valid_content

`wipe(self)`

Resets and invalidates dataobj

Source code in archivy/models.py

def wipe(self):
    """Blank out title and content, which resets and invalidates the dataobj"""
    self.title = self.content = ""

`User (UserMixin)`

Model we use for User that inherits from flask login's UserMixin

username
password
is_admin

Source code in archivy/models.py

@attrs(kw_only=True)
class User(UserMixin):
    """
    User model, built on flask login's
    [`UserMixin`](https://flask-login.readthedocs.io/en/latest/#flask_login.UserMixin)

    Attributes:

    - **username**
    - **password**
    - **is_admin**
    """

    username: str = attrib(validator=instance_of(str))
    password: Optional[str] = attrib(validator=optional(instance_of(str)), default=None)
    is_admin: Optional[bool] = attrib(
        validator=optional(instance_of(bool)), default=None
    )
    # NOTE(review): the default is `False` rather than `None` - confirm whether
    # callers rely on that before changing it.
    id: Optional[int] = attrib(validator=optional(instance_of(int)), default=False)

    def insert(self):
        """
        Inserts the user into the database.

        Returns what the database's `insert` returns, or `False` when the
        user has no password or the username is already taken.
        """
        if not self.password:
            return False

        hashed_password = generate_password_hash(self.password)
        db = helpers.get_db()

        user_query = Query()
        already_there = db.search(
            (user_query.type == "user") & (user_query.username == self.username)
        )
        if already_there:
            return False

        record = dict(
            username=self.username,
            hashed_password=hashed_password,
            is_admin=self.is_admin,
            type="user",
        )

        # the hook runs before the record is written
        current_app.config["HOOKS"].on_user_create(self)
        return db.insert(record)

    @classmethod
    def from_db(cls, db_object):
        """
        Takes a database object and turns it into a user.

        Only the username and the document id are carried over.
        """
        return cls(username=db_object["username"], id=db_object.doc_id)

`from_db(db_object)` `classmethod`

Takes a database object and turns it into a user

Source code in archivy/models.py

@classmethod
def from_db(cls, db_object):
    """
    Turn a database object into a user.

    Only the username and the document id are carried over.
    """
    return cls(username=db_object["username"], id=db_object.doc_id)

`insert(self)`

Inserts the model into the database

Source code in archivy/models.py

def insert(self):
    """
    Insert the user into the database.

    Returns what the database's `insert` returns, or `False` when the user
    has no password or the username is already taken.
    """
    if not self.password:
        return False

    hashed_password = generate_password_hash(self.password)
    db = helpers.get_db()

    user_query = Query()
    already_there = db.search(
        (user_query.type == "user") & (user_query.username == self.username)
    )
    if already_there:
        return False

    record = dict(
        username=self.username,
        hashed_password=hashed_password,
        is_admin=self.is_admin,
        type="user",
    )

    # the hook runs before the record is written
    current_app.config["HOOKS"].on_user_create(self)
    return db.insert(record)