Data Assembly¶
Overview¶
Data assembly in AIDDDMAP is handled primarily through the Interactive Data Assembly Tool (IDAT), a visual workspace for creating, modifying, and managing data workflows. IDAT combines drag-and-drop assembly with AI agent assistance to streamline data preparation and publishing.
IDAT Canvas¶
Architecture¶
interface IDATPanelConfig {
  dimensions: {
    width: number;
    height: number;
  };
  grid: {
    size: number;
    snap: boolean;
  };
  theme: {
    background: string;
    gridColor: string;
    nodeColors: Record<string, string>;
  };
}

interface IDATPanelState {
  nodes: Node[];
  edges: Edge[];
  selectedElements: (Node | Edge)[];
  mode: "select" | "connect" | "pan";
}
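The shapes above suggest how a canvas might be configured and initialized. The values below are purely illustrative, and the names `defaultPanelConfig` and `initialState` are placeholders rather than part of the documented API.

// Illustrative only: a panel configuration with assumed default values.
const defaultPanelConfig: IDATPanelConfig = {
  dimensions: { width: 1280, height: 720 },
  grid: { size: 16, snap: true },
  theme: {
    background: "#1e1e2e",
    gridColor: "#313244",
    nodeColors: {
      dataset: "#89b4fa",
      agent: "#a6e3a1",
      transform: "#f9e2af",
      output: "#f38ba8",
    },
  },
};

// An empty canvas starts with no nodes or edges and the default "select" mode.
const initialState: IDATPanelState = {
  nodes: [],
  edges: [],
  selectedElements: [],
  mode: "select",
};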
Node Types¶
interface BaseNode {
  id: string;
  type: NodeType;
  position: { x: number; y: number };
  data: any;
}

enum NodeType {
  DATASET = "dataset",
  AGENT = "agent",
  TRANSFORM = "transform",
  OUTPUT = "output",
}

// Dataset Node
interface DatasetNode extends BaseNode {
  type: NodeType.DATASET;
  data: {
    name: string;
    schema: SchemaDefinition;
    preview: DataPreview;
    metadata: DatasetMetadata;
  };
}

// Agent Node
interface AgentNode extends BaseNode {
  type: NodeType.AGENT;
  data: {
    agent: BaseAgent;
    status: AgentStatus;
    configuration: AgentConfig;
  };
}
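Because `type` is a discriminant shared by every node interface, TypeScript can narrow a node to its concrete shape before its `data` is touched. A minimal sketch, where the `describeNode` helper is illustrative rather than part of the documented API:

// Illustrative only: the `type` field narrows the union inside each case.
function describeNode(node: DatasetNode | AgentNode): string {
  switch (node.type) {
    case NodeType.DATASET:
      return `Dataset "${node.data.name}" at (${node.position.x}, ${node.position.y})`;
    case NodeType.AGENT:
      return `Agent node in status "${node.data.status}"`;
  }
}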
Workflow Creation¶
1. Visual Assembly¶
// Example of creating a workflow
const workflow = new DataWorkflow({
  name: "Health Data Processing",
  description: "Process and analyze health metrics",
});

// Add nodes
workflow.addNode({
  type: NodeType.DATASET,
  data: {
    name: "Raw Health Data",
    source: "fitness_tracker",
  },
});

workflow.addNode({
  type: NodeType.AGENT,
  data: {
    agent: new DataCuratorAgent(),
    configuration: {
      cleansingRules: ["remove_nulls", "standardize_dates"],
    },
  },
});
2. Node Connections¶
interface Edge {
  id: string;
  source: string;
  target: string;
  type: EdgeType;
  data?: {
    transformation?: TransformationRule[];
    validation?: ValidationRule[];
  };
}

enum EdgeType {
  DATA_FLOW = "data_flow",
  CONTROL_FLOW = "control_flow",
  FEEDBACK = "feedback",
}
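Connecting the dataset node to the curator agent from the example above might produce an edge like the following. The node ids are illustrative placeholders, and the attached validation rule simply shows where per-edge rules would live.

// Illustrative only: a data-flow edge from the dataset node to the agent node.
const rawToCurator: Edge = {
  id: "edge-raw-to-curator",
  source: "node-raw-health-data",   // assumed id of the dataset node
  target: "node-data-curator",      // assumed id of the agent node
  type: EdgeType.DATA_FLOW,
  data: {
    validation: [
      { field: "heart_rate", type: "range", parameters: { min: 30, max: 220 } },
    ],
  },
};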
3. Data Transformations¶
interface TransformationRule {
  type: "map" | "filter" | "reduce" | "aggregate";
  config: {
    input: string[];
    output: string[];
    operation: string;
    parameters: Record<string, any>;
  };
}

// Example transformation
const transformation = {
  type: "map",
  config: {
    input: ["heart_rate", "timestamp"],
    output: ["avg_heart_rate"],
    operation: "moving_average",
    parameters: {
      window_size: 5,
    },
  },
};
Real-time Processing¶
1. Live Data Handling¶
interface LiveDataConfig {
  source: DataSource;
  bufferSize: number;
  updateInterval: number;
  errorHandling: ErrorHandlingStrategy;
}

class LiveDataHandler {
  private buffer: CircularBuffer;
  private interval: number;

  constructor(config: LiveDataConfig) {
    this.buffer = new CircularBuffer(config.bufferSize);
    this.interval = config.updateInterval;
  }

  async processIncomingData(data: any): Promise<void> {
    await this.buffer.add(data);
    await this.notifySubscribers(data);
  }

  private async notifySubscribers(data: any): Promise<void> {
    // Subscriber fan-out omitted for brevity.
  }
}
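Wiring up a handler might look like the sketch below. The `fitnessSource` and `dropOldest` values are assumed placeholders for a `DataSource` and an `ErrorHandlingStrategy`; the numbers are illustrative.

// Illustrative only: construct a handler and push a single reading through it.
const liveHandler = new LiveDataHandler({
  source: fitnessSource,        // assumed DataSource instance
  bufferSize: 500,              // keep the most recent 500 readings
  updateInterval: 1000,         // notify subscribers every second
  errorHandling: dropOldest,    // assumed ErrorHandlingStrategy
});

await liveHandler.processIncomingData({ heart_rate: 72, timestamp: Date.now() });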
2. State Management¶
interface WorkflowState {
  status: "idle" | "processing" | "error";
  progress: number;
  activeNodes: string[];
  errors: WorkflowError[];
  metrics: {
    processedItems: number;
    throughput: number;
    latency: number;
  };
}
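A state snapshot can be summarized for display or logging. The helper below is a sketch, assuming `progress` is a fraction between 0 and 1.

// Illustrative only: render a one-line progress summary from a state snapshot.
function summarizeState(state: WorkflowState): string {
  const pct = Math.round(state.progress * 100); // assumes progress is a 0-1 fraction
  return `${state.status} (${pct}%) | ${state.metrics.processedItems} items processed, ` +
    `${state.activeNodes.length} active nodes, ${state.errors.length} errors`;
}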
Integration with AI Agents¶
1. Agent Deployment¶
// Deploy an agent to the workflow
const agentNode = await workflow.deployAgent({
  agent: new DataCuratorAgent(),
  position: { x: 300, y: 200 },
  configuration: {
    mode: "automatic",
    triggers: ["new_data", "schema_change"],
  },
});
2. Agent Communication¶
interface AgentMessage {
  type: "command" | "status" | "result";
  payload: any;
  timestamp: Date;
}

// Example of agent interaction
agentNode.on("data", async (data: any) => {
  const result = await agentNode.agent.process(data);
  workflow.updateNode(agentNode.id, { result });
});
Data Quality & Validation¶
1. Schema Validation¶
interface SchemaValidation {
  rules: ValidationRule[];
  onError: (error: ValidationError) => void;
  autoFix: boolean;
}

interface ValidationRule {
  field: string;
  type: "required" | "format" | "range" | "custom";
  parameters: any;
}
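Put together, a validation configuration for the health dataset might look like the following. The fields and parameter values are illustrative.

// Illustrative only: required, range, and format rules for incoming readings.
const healthValidation: SchemaValidation = {
  rules: [
    { field: "heart_rate", type: "required", parameters: {} },
    { field: "heart_rate", type: "range", parameters: { min: 30, max: 220 } },
    { field: "timestamp", type: "format", parameters: { format: "ISO8601" } },
  ],
  onError: (error) => console.warn("Validation failed:", error),
  autoFix: true, // let the workflow apply safe corrections automatically
};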
2. Quality Metrics¶
interface QualityMetrics {
  completeness: number;
  accuracy: number;
  consistency: number;
  timeliness: number;
  custom: Record<string, number>;
}
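These scores can be used to gate publishing. The check below is a sketch; the thresholds are illustrative and assume each metric is normalized to a 0-1 range.

// Illustrative only: require minimum quality scores before publishing.
function meetsQualityBar(metrics: QualityMetrics): boolean {
  return (
    metrics.completeness >= 0.95 &&
    metrics.accuracy >= 0.9 &&
    metrics.consistency >= 0.9 &&
    metrics.timeliness >= 0.8
  );
}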
Best Practices¶
1. Performance Optimization¶
// Implement data streaming for large datasets
const stream = new DataStream({
  batchSize: 1000,
  parallel: true,
  maxConcurrency: 4,
});

stream.on("data", async (batch) => {
  await workflow.processBatch(batch);
});
2. Error Handling¶
try {
  await workflow.execute();
} catch (error) {
  if (error instanceof ValidationError) {
    await workflow.handleValidationError(error);
  } else if (error instanceof ProcessingError) {
    await workflow.retry({
      maxAttempts: 3,
      backoff: "exponential",
    });
  }
}
3. Resource Management¶
const resources = new ResourceManager({
  maxMemory: "4GB",
  maxConcurrency: 8,
  timeout: 30000,
});

workflow.setResourceConstraints(resources);
Example Workflows¶
1. Data Preparation Pipeline¶
const pipeline = new DataPipeline()
  .addSource(new DatasetNode("raw_data"))
  .addTransformation(new CleansingNode())
  .addAgent(new DataCuratorAgent())
  .addValidation(new QualityCheckNode())
  .addDestination(new OutputNode("processed_data"));

await pipeline.execute();
2. Real-time Analytics¶
const realtimeWorkflow = new RealTimeWorkflow()
  .setSource(
    new StreamingSource({
      type: "websocket",
      url: "ws://data-stream",
    }),
  )
  .addProcessing(
    new WindowedAggregation({
      window: "5m",
      operation: "average",
    }),
  )
  .addVisualization(new LiveDashboard());
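Assuming `RealTimeWorkflow` exposes lifecycle methods analogous to `pipeline.execute()`, running the workflow might look like the sketch below; `start()` and `stop()` are assumed names, not part of the documented API.

// Illustrative only: begin consuming the stream, then shut down cleanly.
await realtimeWorkflow.start();
// ... later, during shutdown:
await realtimeWorkflow.stop();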