SQLAlchemy Joining 2 CTE Results in Ambiguous ON Clause

Question:

I’m attempting retrieve a max version for either a published version or, if a resource has no published versions, the highest version. I’m using 3 CTE to find these values, one to get the max version that is published, a second to get the max version overall, and lastly a third to do an outer join which produces the highest published version if it exists, if not the highest version.

The issue I’m having in SQLAlchemy is attempting to join the first 2 CTE so that I can produce a single result for each parent of the versions.

The expected query looks like:

WITH highest_published AS (
    SELECT parent_id AS parent_id, MAX(subversion) AS m_version
    FROM child_version
    WHERE published AND NOT deleted
    GROUP BY parent_id
),
highest_unpublished AS (
    SELECT parent_id AS parent_id, MAX(subversion) AS m_version
    FROM child_version
    WHERE NOT deleted
    GROUP BY parent_id
),
max_versions AS (
    SELECT 
    CASE  WHEN hp.parent_id IS NOT NULL THEN hp.parent_id ELSE hu.parent_id END AS parent_id,
    CASE WHEN hp.m_version IS NOT null THEN hp.m_version ELSE hu.m_version END AS m_version
    FROM highest_unpublished AS hu
    LEFT OUTER JOIN highest_published AS hp ON hp.parent_id=hu.parent_id
)
SELECT child_version.id, child_version.parent_id FROM child_version 
JOIN max_versions ON child_version.parent_id=max_versions.parent_id 
AND child_version.subversion=max_versions.m_version
ORDER BY child_version.parent_id

This is the SA code using the ORM I would expect to produce this:

from sqlalchemy.dialects.postgresql import UUID
import uuid

class ChildVersion(db.Model):
    id = db.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    deleted = db.Column(db.Boolean, default=False, nullable=False)
    parent_id = db.Column(UUID(as_uuid=True), db.ForeignKey(Parent.id), nullable=True, index=True)
    subversion = db.Column(db.Integer, default=0, nullable=False)
    published = db.Column(db.Boolean, default=False, nullable=False)

highest_published_version = ChildVersion.query.with_entities(
    ChildVersion.parent_id.label('parent_id'),
    sa.func.max(ChildVersion.subversion).label('m_version'),
    ).filter(
        ChildVersion.published, ~ChildVersion.deleted
    ).group_by(ChildVersion.parent_id).cte(name='highest_published')

highest_unpublished_version = ChildVersion.query.with_entities(
    ChildVersion.parent_id.label('parent_id'),
    sa.func.max(ChildVersion.subversion).label('m_version'), 
    ).filter(
        ~ChildVersion.deleted
    ).group_by(ChildVersion.parent_id).cte(name='highest_unpublished')

versions = db.session.query(highest_unpublished_version).with_entities(
    sa.case(
        (highest_published_version.c.parent_id.is_not(None), highest_published_version.c.parent_id), 
        else_=highest_unpublished_version.c.parent_id).label('parent_id'),
    sa.case(
        (highest_published_version.c.m_version.is_not(None), highest_published_version.c.m_version), 
        else_=highest_unpublished_version.c.m_version).label('m_version'),
    ).join(highest_published_version, sa.and_(highest_unpublished_version.c.parent_id==highest_published_version.c.parent_id,
                highest_unpublished_version.c.m_version==highest_published_version.c.m_version),
            isouter=True
    ).cte(name='max_versions')

However I receive an error where my join is ambiguous:

Don't know how to join to <sqlalchemy.sql.selectable.CTE at 0x10b2c3ee0; highest_published>. Please use the
.select_from() method to establish an explicit left side, as well as providing an explicit ON clause if not present
already to help resolve the ambiguity.

Using the sqlalchemy.select to attempt to join the CTE results in invalid SQL.

highest_published_version = ChildVersion.query.with_entities(
    ChildVersion.parent_id.label('parent_id'),
    sa.func.max(ChildVersion.subversion).label('m_version'),
    ).filter(
        ChildVersion.published, ~ChildVersion.deleted
    ).group_by(ChildVersion.parent_id).cte(name='highest_published')

highest_unpublished_version = ChildVersion.query.with_entities(
    ChildVersion.parent_id.label('parent_id'),
    sa.func.max(ChildVersion.subversion).label('m_version'), 
    ).filter(
        ~ChildVersion.deleted
    ).group_by(ChildVersion.parent_id).cte('highest_unpublished')

versions = db.session.query(highest_unpublished_version).with_entities(
    sa.case(
        (highest_published_version.c.parent_id.is_not(None), highest_published_version.c.parent_id), 
        else_=highest_unpublished_version.c.parent_id).label('parent_id'),
    sa.case(
        (highest_published_version.c.m_version.is_not(None), highest_published_version.c.m_version), 
        else_=highest_unpublished_version.c.m_version).label('m_version'),
    ).join(sa.select(highest_published_version), sa.and_(highest_unpublished_version.c.parent_id==highest_published_version.c.parent_id,
                highest_unpublished_version.c.m_version==highest_published_version.c.m_version),
            isouter=True
    ).cte(name='max_versions')

versions_with_ids = ChildVersion.query.with_entities(
    ChildVersion.id
    ).join(
        versions, sa.and_(versions.c.parent_id==ChildVersion.parent_id,
            versions.c.m_version==ChildVersion.subversion)
    )
(psycopg2.errors.UndefinedTable) invalid reference to FROM-clause entry for table "highest_unpublished"
LINE 28: FROM highest_published) AS anon_4 ON highest_unpublished.com...
^
HINT: There is an entry for table "highest_unpublished", but it cannot be referenced from this part of the query.

[SQL: WITH highest_unpublished AS
(SELECT child_version.parent_id AS parent_id, max(child_version.subversion) AS m_version
FROM child_version
WHERE NOT child_version.deleted GROUP BY child_version.parent_id),
highest_published AS
(SELECT child_version.parent_id AS parent_id, max(child_version.subversion) AS m_version
FROM child_version
WHERE child_version.published AND NOT child_version.deleted GROUP BY
child_version.parent_id),
max_versions AS
(SELECT CASE WHEN (highest_published.parent_id IS NOT NULL) THEN highest_published.parent_id ELSE
highest_unpublished.parent_id END AS parent_id, CASE WHEN (highest_published.m_version IS NOT NULL) THEN
highest_published.m_version ELSE highest_unpublished.m_version END AS m_version
FROM highest_unpublished, highest_published LEFT OUTER JOIN (SELECT highest_published.parent_id AS parent_id,
highest_published.m_version AS m_version
FROM highest_published) AS anon_4 ON highest_unpublished.parent_id = highest_published.parent_id AND
highest_unpublished.m_version = highest_published.m_version)
SELECT child_version.id
FROM child_version JOIN max_versions ON max_versions.parent_id = child_version.parent_id AND
max_versions.m_version = child_version.subversion) AND NOT child_version.deleted GROUP BY
child_version.parent_id, child_version.id]
(Background on this error at: https://sqlalche.me/e/14/f405)

Any help would be greatly appreciated!

Asked By: BusinessFawn

||

Answers:

I created a test script that seems to work. It is hard to tell without actual data what should happening. I’m using SA directly without flask. Also I just created a stub for the Parent class. I used the new select() with SQLAlchemy version 1.4/2.0.

import sys
import uuid

from sqlalchemy import (
    create_engine,
    Integer,
    ForeignKey,
    Boolean,
)
from sqlalchemy.schema import (
    Column,
)
from sqlalchemy.sql import select, and_, func, case
from sqlalchemy.orm import declarative_base
from sqlalchemy.orm import Session
from sqlalchemy.dialects.postgresql import UUID


Base = declarative_base()


username, password, db = sys.argv[1:4]


engine = create_engine(f"postgresql+psycopg2://{username}:{password}@/{db}", echo=True)




class Parent(Base):
    __tablename__ = "parent"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)


class ChildVersion(Base):

    __tablename__ = "child"
    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    deleted = Column(Boolean, default=False, nullable=False)
    parent_id = Column(UUID(as_uuid=True), ForeignKey(Parent.id), nullable=True, index=True)
    subversion = Column(Integer, default=0, nullable=False)
    published = Column(Boolean, default=False, nullable=False)


Base.metadata.create_all(engine)


with Session(engine) as session, session.begin():

    highest_published = select(
        ChildVersion.parent_id.label('parent_id'),
        func.max(ChildVersion.subversion).label('m_version')
    ).select_from(
        ChildVersion
    ).where(
        ChildVersion.published,
        ~ChildVersion.deleted
    ).group_by(
        ChildVersion.parent_id
    ).cte(name='highest_published')

    highest_unpublished = select(
        ChildVersion.parent_id.label('parent_id'),
        func.max(ChildVersion.subversion).label('m_version'),
    ).select_from(
        ChildVersion
    ).where(
        ~ChildVersion.deleted
    ).group_by(
        ChildVersion.parent_id
    ).cte('highest_unpublished')

    max_versions = select(
        case(
            (highest_published.c.parent_id.is_not(None), highest_published.c.parent_id),
            else_=highest_unpublished.c.parent_id).label('parent_id'),
        case(
            (highest_published.c.m_version.is_not(None), highest_published.c.m_version),
            else_=highest_unpublished.c.m_version).label('m_version'),
    ).select_from(
        highest_unpublished
    ).outerjoin(
        highest_published,
        highest_unpublished.c.parent_id==highest_published.c.parent_id,
        # Not in original SQL?
        highest_unpublished.c.m_version==highest_published.c.m_version
    ).cte(name='max_versions')

    versions_with_ids = select(
        ChildVersion.id,
        ChildVersion.parent_id
    ).select_from(
        ChildVersion
    ).join(
        max_versions, and_(
            ChildVersion.parent_id == max_versions.c.parent_id,
            ChildVersion.subversion == max_versions.c.m_version
        )
    ).order_by(ChildVersion.parent_id)

    session.execute(versions_with_ids)

Generated SQL via SQLAlchemy’s echo=True.

WITH highest_unpublished AS 
(SELECT child.parent_id AS parent_id, max(child.subversion) AS m_version 
FROM child 
WHERE NOT child.deleted GROUP BY child.parent_id), 
highest_published AS 
(SELECT child.parent_id AS parent_id, max(child.subversion) AS m_version 
FROM child 
WHERE child.published AND NOT child.deleted GROUP BY child.parent_id), 
max_versions AS 
(SELECT CASE WHEN (highest_published.parent_id IS NOT NULL) THEN highest_published.parent_id ELSE highest_unpublished.parent_id END AS parent_id, CASE WHEN (highest_published.m_version IS NOT NULL) THEN highest_published.m_version ELSE highest_unpublished.m_version END AS m_version 
FROM highest_unpublished LEFT OUTER JOIN highest_published ON highest_unpublished.parent_id = highest_published.parent_id)
 SELECT child.id, child.parent_id 
FROM child JOIN max_versions ON child.parent_id = max_versions.parent_id AND child.subversion = max_versions.m_version ORDER BY child.parent_id```
Answered By: Ian Wilson

I was able to solve this using a select_from as suggested in the error message.

highest_published_version = ChildVersion.query.with_entities(
    ChildVersion.competency_path_id.label('parent_id'),
    sa.func.max(ChildVersion.subversion).label('m_version'),
    ).filter(
        ChildVersion.published, ~ChildVersion.deleted
    ).group_by(ChildVersion.competency_path_id).cte(name='highest_published')

highest_unpublished_version = ChildVersion.query.with_entities(
    ChildVersion.competency_path_id.label('parent_id'),
    sa.func.max(ChildVersion.subversion).label('m_version'), 
    ).filter(
        ~ChildVersion.deleted
    ).group_by(ChildVersion.competency_path_id).cte('highest_unpublished')

versions = db.session.query(highest_published_version, highest_unpublished_version).select_from(highest_unpublished_version).with_entities(
    sa.case(
        (highest_published_version.c.parent_id.is_not(None), highest_published_version.c.parent_id), 
        else_=highest_unpublished_version.c.parent_id).label('parent_id'),
    sa.case(
        (highest_published_version.c.m_version.is_not(None), highest_published_version.c.m_version), 
        else_=highest_unpublished_version.c.m_version).label('m_version'),
).join(highest_published_version, sa.and_(highest_unpublished_version.c.parent_id==highest_published_version.c.parent_id,
            highest_unpublished_version.c.m_version==highest_published_version.c.m_version),
        isouter=True
).cte(name='max_versions')
Answered By: BusinessFawn