UDF to get an aggregated average split by multiple categories across nested documents

Hello Folks,

I’m new to FaunaDB. I’m using it as a back end for a simple Angular application that collects survey responses from users. It’s been tremendously easy to work with so far. I had my GraphQL API up and running in a few minutes!

However, I’m now at a point were I need to do a little bit more. Specifically I want to return some summarised scores for the survey to be used in charts and graphs. I’ve been trying to achieve this with a UDF. I’ve followed along the online tutorials (how to query nested documents and how to aggregate data). I’ve found the FQL language quiet verbose and the online tutorials to be far too simple to help me so I’m turning here for help.

My schema is below where its made up of:

  • BenchmarkResponse: Top level document
    → BenchmarkResponsesOrgDetail: information about the organization
    → benchmarkResponsesProgramDetail: information about the organization data function
    → BenchmarkResponsesScore: a key value pairing of the users response for each of 24 questions included in the survey. The answer can be null or a number between 1 and 6.

The schema is shown below where most fields have their own GraphQL type that are all just enums.

Schema:

type BenchmarkResponse {
  benchmarkResponsesOrgDetails: BenchmarkResponsesOrgDetail
  benchmarkResponsesProgramDetails: BenchmarkResponsesProgramDetails
  benchmarkResponsesScores: BenchmarkResponsesScore
}

type BenchmarkResponsesOrgDetail {
  orgName: String
  orgIndustry: OrgIndustry
  orgFinanceSubIndustry: OrgFinanceSubIndustry
  orgSize: OrgSize
  orgHQCountry: OrgHQCountry
  orgAreaOfOperation: OrgAreaOfOperation
}

type BenchmarkResponsesProgramDetails {
  programScope: ProgramScope
  programYearsInOperation: ProgramYearsInOperation
  programReportsTo: ProgramReportsTo
  programFundingModel: ProgramFundingModel
  programModelUtilized: YesNo
  programDataScienceEstablished: ProgramDataScienceEstablished
  programOversightOfAnalytics: ProgramOversightOfAnalytics
}

type BenchmarkResponsesScore {
  Question_1: Int
  Question_2: Int
  Question_3: Int
  Question_4: Int
  Question_5: Int
  Question_6: Int
  Question_7: Int
  Question_8: Int
  Question_9: Int
  Question_10: Int
  Question_11: Int
  Question_12: Int
  Question_13: Int
  Question_14: Int
  Question_15: Int
  Question_16: Int
  Question_17: Int
  Question_18: Int
  Question_19: Int
  Question_20: Int
  Question_21: Int
  Question_22: Int
  Question_23: Int
  Question_24: Int
}

I want to write a UDF that will return the average of each score across the responses split by various fields. I then want to use this UDF as a custom resolver for a query. Below is the GraphQL schema for what I want the GraphQl response to look like.

type BenchmarkDetails{
  orgIndustry: OrgIndustry
  orgFinanceSubIndustry: OrgFinanceSubIndustry
  orgHQCountry:OrgHQCountry
  orgAreaOfOperation: OrgAreaOfOperation
  orgSize: OrgSize
  programDataScienceEstablished: ProgramDataScienceEstablished
  programOversightOfAnalytics: ProgramOversightOfAnalytics
  programYearsInOperation: ProgramYearsInOperation
  programModelUtilized: YesNo
  programFundingModel: ProgramFundingModel
  programReportsTo: ProgramReportsTo
  programScope: String
  AVG_Question_1: Float
  AVG_Question_2: Float
  ...
  AVG_Question_24: Float
}

I’ve tried various different things to do this over the last couple of days. To be honest I;ve just gotten myself tied up in a series of Map, Query, Let and Lambda statements. I was able to create a query to join the nested documents. I was also able to write a query to aggregate a single question score split by one of the categories. I haven’t been able to get even close to bringing this together into one query.

The SQL equivalent of what I am trying to do is below.

select 
a.orgIndustry,
a.orgFinanceSubIndustry,
...
b. programDataScienceEstablished,
b.programOversightOfAnalytics,
...
AVG(c.Question_1),
AVG(c.Question_2),
...
from a join b join c
group by 
a.orgIndustry,
a.orgFinanceSubIndustry,
...
b. programDataScienceEstablished,
b.programOversightOfAnalytics,
...

This the function that I tried to use to consolidate the documents but I couldn’t get further than this.

CreateFunction({
  name: "allbenchmarks",
  body: Query(Lambda(["size", "afterCursor", "beforeCursor"],
    Map(Paginate(Match(Index("allBenchmarkResponses"))), Lambda("bmRef", Let({
        responseDoc: Get(Var("bmRef")),
        orgRef: Select(
            ["data", "benchmarkResponsesOrgDetails"], Var("responseDoc")),
        orgDoc: Get(Var("orgRef")),
        progRef: Select(
            ["data", "benchmarkResponsesProgramDetails"], Var("responseDoc")),
        progDoc: Get(Var("progRef")),
        scoreRef: Select(
            ["data", "benchmarkResponsesScores"], Var("responseDoc")),
        scoreDoc: Get(Var("scoreRef"))
    }, {
        orgIndustry: Select(["data", "orgIndustry"], Var("orgDoc")),
        orgFinanceSubIndustry: Select(
            ["data", "orgFinanceSubIndustry"], Var("orgDoc")),
        orgHQCountry: Select(["data", "orgHQCountry"], Var("orgDoc")),
        orgAreaOfOperation: Select(
            ["data", "orgAreaOfOperation"], Var("orgDoc")),
        orgSize: Select(["data", "orgSize"], Var("orgDoc")),
        programDataScienceEstablished: Select(
            ["data", "programDataScienceEstablished"], Var("progDoc")),
        programOversightOfAnalytics: Select(
            ["data", "programOversightOfAnalytics"], Var("progDoc")),
        programYearsInOperation: Select(
            ["data", "programYearsInOperation"], Var("progDoc")),
        programModelUtilized: Select(
            ["data", "programModelUtilized"], Var("progDoc")),
        programFundingModel: Select(
            ["data", "programFundingModel"], Var("progDoc")),
        programReportsTo: Select(["data", "programReportsTo"], Var("progDoc")),
        programScope: Select(["data", "programScope"], Var("progDoc")),
        DCAM_BM_2019_1: Select(["data", "DCAM_Question_1"], Var("scoreDoc"))
    })))
  ))
})

Here is a sample document (with a few extra docs).

{
  "ref": Ref(Collection("BenchmarkResponse"), "287396098850423304"),
  "ts": 1610341128140000,
  "data": {
    "benchmarkResponsesScores": Ref(Collection("BenchmarkResponsesScore"), "287396098842037768"),
    "benchmarkResponsesProgramDetails": Ref(Collection("BenchmarkResponsesProgramDetails"), "287396098842034696"),
    "benchmarkResponsesOrgDetails": Ref(Collection("BenchmarkResponsesOrgDetail"), "287396098842036744"),
  }
}

{
  "ref": Ref(Collection("BenchmarkResponsesScore"), "287396098842037768"),
  "ts": 1610341128140000,
  "data": {
    "Question_3": 3,
    "Question_20": 4,
    "Question_11": 3,
    "Question_6": 4,
    "Question_10": 4,
    "Question_15": 4,
    "Question_9": 3,
    "Question_2": 3,
    "Question_21": 4,
    "Question_24": 3,
    "Question_5": 4,
    "Question_14": 3,
    "Question_23": 3,
    "Question_18": 3,
    "Question_13": 4,
    "Question_8": 5,
    "Question_17": 4,
    "Question_22": 4,
    "Question_12": 4,
    "Question_7": 3,
    "Question_4": 2,
    "Question_19": 5,
    "Question_16": 3,
    "Question_1": 3
  }
}

{
  "ref": Ref(Collection("BenchmarkResponsesProgramDetails"), "287396098842034696"),
  "ts": 1610341128140000,
  "data": {
    "programDataScienceEstablished": "In_Process_of_Being_Established",
    "programOversightOfAnalytics": "Partial",
    "programYearsInOperation": "A_Less_Than_One_Year",
    "programModelUtilized": "No",
    "programFundingModel": "Centralized",
    "programReportsTo": "COO",
    "programScope": "Global_or_Enterprise_Wide"
  }
}

{
  "ref": Ref(Collection("BenchmarkResponsesOrgDetail"), "287396098842036744"),
  "ts": 1610341128140000,
  "data": {
    "orgName": "Test",
    "orgIndustry": "Finance_Insurance_and_Banking",
    "orgAreaOfOperation": "Regional",
    "orgHQCountry": "Albania",
    "orgSize": "Tier_2",
    "orgFinanceSubIndustry": "Insurance"
  }
}

Thanks for reading and the help if you can offer it.

Best
Michael

Hey Michael.

Sorry for the late response. You are doing a very analytical type of query (OLAP) on a database that is primarily meant for OLTP queries. One day we might be the greatest solution there exists in OLAP, but that’s not the case today :upside_down_face:.

Start with getting averages fro a specific combination of terms (orgNAme, orgIndusttry, programScope,…)

Although you can technically run all this in one query (but might run into pagesize limits or a query that runs too long) I would personally split it up in multiple queries since I do assume that ACID is less relevant here?

Regardless of whether you start from here or do the query these separately, this is what you you are missing:

Create indexes:

Create an index on program details and on program org details If you always filter on everything, you are in luck and can simply list all the terms in both index. I would advise you to start off like that and gradually increase the complexity of your query once you know that works.

If you sometimes filter only on a few of these you will have to create multiple indexes and multiple UDFs and/or combine multiple index matches in one FQL query such as explained here: (How to query by multiple conditions in faunadb? - Stack Overflow). Which option you choose depends on the tradeoffs.

Match on both indexes, join to go to BenchmarkResponse reference, then intersect those references.

Assuming you choose the easy solution here, you then want to match on both indexes and intersect them. Very similar to the most advanced approach explained in the stackoverflow post I linked above except that you have an extra difficulty. You are filtering on two different collections, intersection between those would not work since you need the same values (e.g. references) to be able to intersect. No problem though, you have a common related document (BenchmarkResponse). So once you have filtered both collections, you would then Join both these results to replace the references of these documents (filtered OrgDetail and filtered ProgramDetails) with the references of the BenchmarkResponse collection they relate to.

Aggregate.

Now you should have the BenchmarkResponse references that correspond to your mult-conditional query (of orgNAme, orgIndusttry, programScope,…)
You now need to aggregate these for which you need to Map over them and Get them.

Manage the complexity.

This query is going to become fairly complex, my apologies that I can’t help you write it, I only have a 8 hours in a day, else I’d gladly help :slight_smile:. However, I did want to give you the advice to start splitting up your queries into chunks to keep it readable for yourself, it might help a lot.

Define a few custom UDFs to help you abstract code.

You could split it up in either different FQL functions which you call with parameters (look here for inspiration Getting started with FQL, Fauna’s native query language - part 4) to keep this manageable and easy to test.

Use the host language.

Or define that query in a driver language, use the host language to compose the query and send it as one chunk to Fauna to create a function. The Stackoverflow example already provides an example for that (look for DropAllButRef). Eigils library is also a great example of that (GitHub - shiftx/faunadb-fql-lib).
Below is one example of how I write my auth logic in JavaScript by composing FQL which might give you inspiration on how to keep it manageable. All below functions are just snippets of FQL composed together in login/logout/refresh UDFs.

export function CreateAccessToken(accountRef, refreshTokenRef, ttlSeconds) {
    return Create(Tokens(), {
        instance: accountRef,
        // A  token is a document just like everything else in Fauna.
        // We will store extra metadata on the token to identify the token type.
        data: {
            type: 'access',
            // We store which refresh token that created the access tokens which allows us to easily invalidate
            // all access tokens created by a specific refresh token.
            refresh: refreshTokenRef
        },
        // access tokens live for 10 minutes, which is typically a good livetime for short-lived tokens.
        ttl: TimeAdd(Now(), ttlSeconds || ACCESS_TOKEN_LIFETIME_SECONDS, 'seconds')
    })
}

export function CreateRefreshToken(accountRef, ttlSeconds) {
    return Create(Tokens(), {
        instance: accountRef,
        data: {
            type: 'refresh',
            used: false,
            sessionId: CreateOrReuseId()
        },
        // 8 hours is a good time for refresh tokens.
        ttl: TimeAdd(Now(), ttlSeconds || REFRESH_TOKEN_LIFETIME_SECONDS, 'seconds'),
    })
}

function CreateOrReuseId() {
    return If(
        IsCalledWithRefreshToken(),
        GetSessionId(),
        NewId()
    )
}

export function GetSessionId() {
    return Select(['data', 'sessionId'], Get(CurrentToken()))
}


export function CreateAccessAndRefreshToken(instance, accessTtlSeconds, refreshTtlSeconds) {
    return Let(
        {
            refresh: CreateRefreshToken(instance, refreshTtlSeconds),
            access: CreateAccessToken(instance, Select(['ref'], Var('refresh')), accessTtlSeconds)
        },
        {
            refresh: Var('refresh'),
            access: Var('access')
        }
    )
}

export function InvalidateAccessAndRefreshToken(refreshTokenRef) {
    return Do(
        InvalidateAccessToken(refreshTokenRef),
        InvalidateRefreshToken(refreshTokenRef)
    )
}

function InvalidateRefreshToken(refreshTokenRef) {
    return Update(refreshTokenRef, { data: { used: true, timeUsed: Now() } })
}

function InvalidateAccessToken(refreshTokenRef) {
    return If(
        Exists(Match(Index('access_token_by_refresh_token'), refreshTokenRef)),
        Delete(Select(['ref'], Get(Match(Index('access_token_by_refresh_token'), refreshTokenRef)))),
        false
    )
}

export function LogoutAccessAndRefreshToken(refreshTokenRef) {
    return Do(
        InvalidateAccessToken(refreshTokenRef),
        Delete(refreshTokenRef)
    )
}

export function IsCalledWithAccessToken() {
    return Equals(Select(['data', 'type'], Get(CurrentToken()), false), 'access')
}

export function IsCalledWithRefreshToken() {
    return And(
        HasCurrentToken(),
        Equals(Select(['data', 'type'], Get(CurrentToken()), false), 'refresh')
    )
}```