Class FlworDataFrameUtils

java.lang.Object
org.rumbledb.runtime.flwor.FlworDataFrameUtils

public class FlworDataFrameUtils extends Object
  • Field Details

    • backtickEscape

      public static String backtickEscape
  • Constructor Details

    • FlworDataFrameUtils

      public FlworDataFrameUtils()
  • Method Details

    • registerKryoClassesKryo

      public static void registerKryoClassesKryo(com.esotericsoftware.kryo.Kryo kryo)
    • serializeItem

      public static byte[] serializeItem(Item toSerialize, com.esotericsoftware.kryo.Kryo kryo, com.esotericsoftware.kryo.io.Output output)
    • serializeItemList

      public static byte[] serializeItemList(List<Item> toSerialize, com.esotericsoftware.kryo.Kryo kryo, com.esotericsoftware.kryo.io.Output output)
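      A minimal usage sketch for the serialization helpers above, assuming one Kryo instance per thread (Kryo is not thread-safe); the ItemFactory call is an assumption about the RumbleDB item API and may differ:

        import java.util.Arrays;
        import java.util.List;

        import com.esotericsoftware.kryo.Kryo;
        import com.esotericsoftware.kryo.io.Output;

        Kryo kryo = new Kryo();
        FlworDataFrameUtils.registerKryoClassesKryo(kryo); // register RumbleDB item classes
        Output output = new Output(128, -1); // small initial buffer, unbounded growth

        // ItemFactory.getInstance().createStringItem(...) is assumed here.
        List<Item> items = Arrays.asList(
            ItemFactory.getInstance().createStringItem("a"),
            ItemFactory.getInstance().createStringItem("b")
        );
        byte[] bytes = FlworDataFrameUtils.serializeItemList(items, kryo, output);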
    • getColumns

      public static List<FlworDataFrameColumn> getColumns(org.apache.spark.sql.types.StructType inputSchema)
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query
      Returns:
      list of FLWOR columns in the schema
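      For illustration, a hedged sketch of listing the FLWOR columns of a small Spark schema (the column names and types are arbitrary):

        import java.util.List;

        import org.apache.spark.sql.types.DataTypes;
        import org.apache.spark.sql.types.Metadata;
        import org.apache.spark.sql.types.StructField;
        import org.apache.spark.sql.types.StructType;

        StructType inputSchema = new StructType(new StructField[] {
            new StructField("x", DataTypes.BinaryType, true, Metadata.empty()),
            new StructField("y", DataTypes.LongType, true, Metadata.empty())
        });
        List<FlworDataFrameColumn> columns = FlworDataFrameUtils.getColumns(inputSchema);
        for (FlworDataFrameColumn column : columns) {
            System.out.println(column); // one entry per FLWOR column in the schema
        }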
    • hasColumnForVariable

      public static boolean hasColumnForVariable(org.apache.spark.sql.types.StructType inputSchema, Name variable)
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query
      variable - the Name of a variable
      Returns:
      true if the schema contains values for this variable.
    • isVariableAvailableAsCountOnly

      public static boolean isVariableAvailableAsCountOnly(org.apache.spark.sql.types.StructType inputSchema, Name variable)
      Checks if the specified variable only has a count in a DataFrame with the supplied schema.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query.
      variable - the name of the variable.
      Returns:
      true if it only has a count, false otherwise.
    • isVariableAvailableAsNativeSequence

      public static boolean isVariableAvailableAsNativeSequence(org.apache.spark.sql.types.StructType inputSchema, Name variable)
      Checks if the specified variable is available as a native sequence of items in a DataFrame with the supplied schema.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query.
      variable - the name of the variable.
      Returns:
      true if it is available as a native sequence of items, false otherwise.
    • isVariableAvailableAsSerializedSequence

      public static boolean isVariableAvailableAsSerializedSequence(org.apache.spark.sql.types.StructType inputSchema, Name variable)
      Checks if the specified variable is available as a serialized sequence of items in a DataFrame with the supplied schema.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query.
      variable - the name of the variable.
      Returns:
      true if it is available as a serialized sequence of items, false otherwise.
    • nativeTypeOfVariable

      public static org.apache.spark.sql.types.DataType nativeTypeOfVariable(org.apache.spark.sql.types.StructType inputSchema, Name variable)
      If the variable is available as a single native item, returns its native SQL data type.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query.
      variable - the name of the variable.
      Returns:
      the native SQL data type of the variable.
    • isVariableAvailableAsNativeItem

      public static boolean isVariableAvailableAsNativeItem(org.apache.spark.sql.types.StructType inputSchema, Name variable)
      Checks if the specified variable is available as a single native item in a DataFrame with the supplied schema.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query.
      variable - the name of the variable.
      Returns:
      true if it is available as a single native item, false otherwise.
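      A hedged sketch that chains the availability checks above to discover how a variable is stored in a FLWOR DataFrame; Name.createVariableInNoNamespace is an assumption about the Name factory API:

        import org.apache.spark.sql.types.DataType;
        import org.apache.spark.sql.types.StructType;

        StructType inputSchema = df.schema(); // df: some FLWOR DataFrame in scope
        Name variable = Name.createVariableInNoNamespace("x"); // assumed factory method

        if (FlworDataFrameUtils.hasColumnForVariable(inputSchema, variable)) {
            if (FlworDataFrameUtils.isVariableAvailableAsCountOnly(inputSchema, variable)) {
                // only a pre-aggregated count of x is stored
            } else if (FlworDataFrameUtils.isVariableAvailableAsNativeSequence(inputSchema, variable)) {
                // x is stored as a native sequence of items
            } else if (FlworDataFrameUtils.isVariableAvailableAsSerializedSequence(inputSchema, variable)) {
                // x is stored as a Kryo-serialized sequence of items
            } else if (FlworDataFrameUtils.isVariableAvailableAsNativeItem(inputSchema, variable)) {
                DataType type = FlworDataFrameUtils.nativeTypeOfVariable(inputSchema, variable);
                // x is stored as a single native item of this SQL type
            }
        }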
    • getColumns

      public static List<FlworDataFrameColumn> getColumns(org.apache.spark.sql.types.StructType inputSchema, Map<Name,DynamicContext.VariableDependency> dependencies)
      Lists the names of the columns of the schema that are needed by the dependencies. Pre-aggregated counts have .count suffixes and might not exactly match the FLWOR variable name.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query
      dependencies - the variable dependencies to which the results are restricted
      Returns:
      list of SQL column names in the schema
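      A hedged sketch of restricting the column list to a dependency map; the FULL and COUNT constants are assumed values of DynamicContext.VariableDependency and may differ in the actual API:

        import java.util.HashMap;
        import java.util.List;
        import java.util.Map;

        Map<Name, DynamicContext.VariableDependency> dependencies = new HashMap<>();
        dependencies.put(Name.createVariableInNoNamespace("x"), DynamicContext.VariableDependency.FULL);  // assumed constant
        dependencies.put(Name.createVariableInNoNamespace("y"), DynamicContext.VariableDependency.COUNT); // assumed constant

        // May yield a column for x and the pre-aggregated y.count column, if present.
        List<FlworDataFrameColumn> needed = FlworDataFrameUtils.getColumns(inputSchema, dependencies);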
    • getColumns

      public static List<FlworDataFrameColumn> getColumns(org.apache.spark.sql.types.StructType inputSchema, Map.Entry<Name,DynamicContext.VariableDependency> dependencies)
      Lists the names of the columns of the schema that are needed by the dependencies. Pre-aggregated counts have .count suffixes and might not exactly match the FLWOR variable name.
      Parameters:
      inputSchema - the schema specifying the columns to be used in the query
      dependencies - the variable dependency to which the results are restricted
      Returns:
      list of SQL column names in the schema
    • getColumns

      public static List<FlworDataFrameColumn> getColumns(org.apache.spark.sql.types.StructType inputSchema, Map<Name,DynamicContext.VariableDependency> dependencies, List<Name> variablesToRestrictTo, List<Name> variablesToExclude)
      Lists the names of the columns of the schema that are needed by the dependencies, excluding duplicates (which are overridden).
      Parameters:
      inputSchema - the schema specifying the type information for all input columns (including those not needed).
      dependencies - the variable dependencies to which the results are restricted
      variablesToRestrictTo - the variables whose columns the result is restricted to.
      variablesToExclude - variables whose columns should be projected away.
      Returns:
      list of SQL column names in the schema
    • getColumns

      public static void getColumns(org.apache.spark.sql.types.StructType inputSchema, Map.Entry<Name,DynamicContext.VariableDependency> dependency, List<Name> variablesToRestrictTo, List<Name> variablesToExclude, List<FlworDataFrameColumn> result)
      Lists the names of the columns of the schema that are needed by the dependencies, excluding duplicates (which are overridden).
      Parameters:
      inputSchema - the schema specifying the type information for all input columns (including those not needed).
      dependency - the one variable dependency to look for
      variablesToRestrictTo - the variables whose columns the result is restricted to.
      variablesToExclude - variables whose columns should be projected away.
      result - the output list to which the SQL column names are appended
    • getUDFParametersFromColumns

      public static String getUDFParametersFromColumns(List<FlworDataFrameColumn> columnNames)
      Prepares the parameters supplied to a UDF, as a row obtained from the specified attributes.
      Parameters:
      columnNames - the names of the columns to pass as a parameter.
      Returns:
      The parameters expressed in SQL.
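      A hedged sketch of splicing the generated parameter list into a SQL statement; the UDF name and view name are placeholders, not part of the actual API (columns, inputSchema, and dependencies are as in the earlier sketches):

        List<FlworDataFrameColumn> columns = FlworDataFrameUtils.getColumns(inputSchema, dependencies);
        String udfParameters = FlworDataFrameUtils.getUDFParametersFromColumns(columns);

        // "myUDF" and "input" are hypothetical names for illustration only.
        String sql = String.format("SELECT myUDF(%s) AS result FROM input", udfParameters);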
    • getSQLProjection

      public static String getSQLProjection(List<String> columnNames, boolean trailingComma)
      Prepares a SQL projection from the specified column names. Not for use in FLWOR DataFrames! Only for native storage of sequences of objects.
      Parameters:
      columnNames - the names of the columns to project
      trailingComma - whether to append a trailing comma
      Returns:
      a comma-separated projection for use in Spark SQL
    • getSQLColumnProjection

      public static String getSQLColumnProjection(List<FlworDataFrameColumn> columnNames, boolean trailingComma)
      Prepares a SQL projection from the specified column names.
      Parameters:
      columnNames - the names of the columns to project
      trailingComma - whether to append a trailing comma
      Returns:
      a comma-separated projection for use in Spark SQL
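      A hedged sketch of building a SELECT list from the projection; with trailingComma set to true, further select items can be appended directly (the view name is a placeholder):

        String projection = FlworDataFrameUtils.getSQLColumnProjection(columns, true);
        String sql = "SELECT " + projection + "1 AS one FROM input"; // "input" is hypothetical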
    • recursiveRename

      public static org.apache.spark.sql.types.StructField[] recursiveRename(org.apache.spark.sql.types.StructType schema, boolean inverse)
    • escapeSchema

      public static org.apache.spark.sql.types.StructType escapeSchema(org.apache.spark.sql.types.StructType schema, boolean inverse)
      Recursively escapes/de-escapes backticks in all fields of a Spark SQL schema.
      Parameters:
      schema - schema to escape
      inverse - if true, perform de-escaping, otherwise escape
      Returns:
      the new schema appropriately escaped/de-escaped
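      A brief sketch of the round trip: escape backticks before generating SQL against the schema, then de-escape afterwards:

        import org.apache.spark.sql.types.StructType;

        StructType escaped = FlworDataFrameUtils.escapeSchema(schema, false);  // escape backticks
        StructType restored = FlworDataFrameUtils.escapeSchema(escaped, true); // de-escape again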
    • getGroupBySQLProjection

      public static String getGroupBySQLProjection(org.apache.spark.sql.types.StructType inputSchema, int duplicateVariableIndex, boolean trailingComma, String serializerUdfName, List<Name> groupbyVariableNames, Map<Name,DynamicContext.VariableDependency> dependencies)
      Prepares a SQL projection for use in a GROUP BY query.
      Parameters:
      inputSchema - the schema specifying the type information for all input columns (including those not needed).
      duplicateVariableIndex - enables skipping a variable
      trailingComma - whether to append a trailing comma
      serializerUdfName - name of the serializer function
      groupbyVariableNames - names of group by variables
      dependencies - variable dependencies of the group by clause
      Returns:
      a comma-separated projection for use in Spark SQL
    • isNativeSequence

      public static boolean isNativeSequence(org.apache.spark.sql.types.StructType schema, String columnName)
    • deserializeWrappedParameters

      public static void deserializeWrappedParameters(scala.collection.immutable.ArraySeq<byte[]> wrappedParameters, List<List<Item>> deserializedParams, com.esotericsoftware.kryo.Kryo kryo, com.esotericsoftware.kryo.io.Input input)
    • reserializeRowWithNewData

      public static org.apache.spark.sql.Row reserializeRowWithNewData(org.apache.spark.sql.Row prevRow, List<Item> newColumn, int duplicateColumnIndex, com.esotericsoftware.kryo.Kryo kryo, com.esotericsoftware.kryo.io.Output output)
    • getCountOfField

      public static long getCountOfField(org.apache.spark.sql.Row row, int columnIndex)
    • zipWithIndex

      public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> zipWithIndex(JSoundDataFrame jdf, Long offset)
      Zips a JSoundDataFrame with a special index column.
      Parameters:
      jdf - the JSoundDataFrame to perform the operation on
      offset - the starting offset for the first index
      Returns:
      the DataFrame with an added column containing row indices (the column name includes a specific UUID)
    • zipWithIndex

      public static org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> zipWithIndex(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df, Long offset, String indexName)
      Algorithm adapted to Java, with minor improvements, from https://stackoverflow.com/a/48454000/10707488.
      Parameters:
      df - the DataFrame to perform the operation on
      offset - the starting offset for the first index
      indexName - the name of the index column
      Returns:
      the DataFrame with the added 'indexName' column containing row indices
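      A hedged usage sketch (the DataFrame df and the index column name are placeholders):

        import org.apache.spark.sql.Dataset;
        import org.apache.spark.sql.Row;

        // Numbers the rows 1, 2, 3, ... without collecting them to the driver.
        Dataset<Row> indexed = FlworDataFrameUtils.zipWithIndex(df, 1L, "rowIndex");
        indexed.show();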
    • schemaUnion

      public static org.apache.spark.sql.types.StructType schemaUnion(org.apache.spark.sql.types.StructType leftSchema, org.apache.spark.sql.types.StructType rightSchema)
    • createTempView

      public static String createTempView(org.apache.spark.sql.Dataset<org.apache.spark.sql.Row> df)
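      A hedged sketch of the typical pattern: register the DataFrame under the generated view name, then query it by that name (df is a placeholder):

        String viewName = FlworDataFrameUtils.createTempView(df);
        Dataset<Row> result = df.sparkSession().sql("SELECT * FROM " + viewName);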