The Art of System Design: From Requirements to Architecture
System design is both an art and a science. It requires technical knowledge, creativity, and the ability to make trade-offs. Let's explore how to master this essential skill.
Understanding Requirements
1. Functional Requirements
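Start with clear, measurable objectives — for example, "a registered user can place an order and receive a confirmation in under two seconds" — rather than vague goals such as "the system should be fast."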
2. Non-Functional Requirements
Consider the quality attributes:
# Non-functional requirements, grouped by quality attribute.
# Nesting is significant in YAML: without indentation every key below would be
# parsed as an unrelated top-level entry.
performance:
  latency:
    p95: < 100ms
    p99: < 200ms
  throughput: 10000 RPS
availability:
  uptime: 99.99%
  recovery_time: < 5 minutes
scalability:
  users: 1M concurrent
  data: 5TB/day
security:
  authentication: OAuth 2.0
  encryption: AES-256
  compliance: SOC2, GDPR
System Architecture Patterns
1. Microservices Architecture
Example service implementation:
// user-service/src/domain/User.ts
/**
 * Shape of a user record handled by the user service.
 */
interface User {
  /** Identifier of the user; `UserService.createUser` fills it from `generateUUID()`. */
  id: string;

  /** E-mail address; `UserService.createUser` uses it to detect duplicates. */
  email: string;

  /** Name of the user. */
  name: string;

  /** Per-user settings (see `UserPreferences`). */
  preferences: UserPreferences;

  /** Timestamp set when the record is created. */
  createdAt: Date;

  /** Timestamp of the latest update; set together with `createdAt` on creation. */
  updatedAt: Date;
}
/**
 * Per-user settings embedded in a `User` record.
 */
interface UserPreferences {
  /** Notification flag for the user. */
  notifications: boolean;

  /** UI theme; restricted to the two listed literals. */
  theme: 'light' | 'dark';

  /** Language setting, stored as a free-form string. */
  language: string;
}
// user-service/src/services/UserService.ts
/**
 * Application service that registers users: it validates the input, rejects
 * duplicate e-mail addresses, persists the record and publishes a
 * `USER_CREATED` event.
 */
class UserService {
  constructor(
    private readonly userRepository: UserRepository,
    private readonly eventBus: EventBus
  ) {}

  /**
   * Creates and persists a new user.
   *
   * @param userData - Fields supplied by the caller; `email` is used for the
   *   duplicate check and every field is copied onto the stored record.
   * @returns The record returned by the repository.
   * @throws DuplicateUserError when a user with the same e-mail already exists.
   */
  async createUser(userData: CreateUserDTO): Promise<User> {
    // NOTE(review): `validateUserData` is not defined in this listing; it is
    // expected to exist on the class and to throw on invalid input.
    this.validateUserData(userData);

    // NOTE(review): lookup-then-create is not atomic; two concurrent calls
    // with the same e-mail can both pass this check unless the repository
    // also enforces uniqueness.
    const existing = await this.userRepository.findByEmail(userData.email);
    if (existing) {
      throw new DuplicateUserError(userData.email);
    }

    // Read the clock once so a freshly created record always has
    // createdAt === updatedAt (two separate `new Date()` calls can straddle a
    // millisecond boundary). Separate Date instances avoid aliasing.
    const now = Date.now();
    const user = await this.userRepository.create({
      ...userData,
      id: generateUUID(),
      createdAt: new Date(now),
      updatedAt: new Date(now)
    });

    // The user is already persisted at this point; a publish failure rejects
    // the call without undoing the insert.
    await this.eventBus.publish('USER_CREATED', {
      userId: user.id,
      email: user.email
    });

    return user;
  }
}
2. Event-Driven Architecture
Implementation example:
// order-service/src/events/OrderEventHandler.ts
/**
 * Publishes order lifecycle events to Kafka and keeps the stored order status
 * in step with what was published.
 */
class OrderEventHandler {
  constructor(
    private readonly kafka: KafkaProducer,
    private readonly orderRepository: OrderRepository
  ) {}

  /**
   * Emits an `ORDER_CREATED` event on the `order-events` topic and then marks
   * the order as `PAYMENT_PENDING`.
   *
   * Any failure in either step is handed to `handleEventError` instead of
   * being rethrown.
   *
   * @param order - The order that was just created.
   */
  async handleOrderCreated(order: Order): Promise<void> {
    try {
      const { id: orderId } = order;

      // Envelope: event type, business payload, and delivery metadata.
      const event = {
        type: 'ORDER_CREATED',
        payload: {
          orderId,
          userId: order.userId,
          amount: order.totalAmount,
          items: order.items
        },
        metadata: {
          timestamp: new Date().toISOString(),
          version: '1.0'
        }
      };

      // Keyed by order id.
      await this.kafka.produce({ topic: 'order-events', key: orderId, value: event });

      // The status only moves once the event has been produced.
      await this.orderRepository.updateStatus(orderId, 'PAYMENT_PENDING');
    } catch (error) {
      // NOTE(review): `handleEventError` is not defined in this listing; the
      // original comment says it is where retry logic belongs.
      await this.handleEventError(error, order);
    }
  }
}
Scalability Patterns
1. Caching Strategies
// services/CacheService.ts
/**
 * Read-through cache: values are served from Redis when present and loaded
 * from the fallback data source (then cached) on a miss.
 */
class CacheService {
  constructor(
    private readonly redis: Redis,
    private readonly fallback: DataSource
  ) {}

  /**
   * Returns the value for `key`, consulting Redis before the fallback source.
   *
   * @param key - Cache key; also passed to the fallback source on a miss.
   * @param options - `ttl` is the expiry in seconds handed to `SETEX`.
   *   `staleWhileRevalidate` is accepted but not acted on by this
   *   implementation.
   * @returns The cached value parsed from JSON, or the freshly fetched value.
   */
  async get<T>(
    key: string,
    options: {
      ttl: number;
      staleWhileRevalidate?: boolean;
    }
  ): Promise<T> {
    const cached = await this.redis.get(key);
    if (cached) {
      // NOTE(review): the parsed value is trusted to match T; nothing
      // validates it here.
      return JSON.parse(cached);
    }

    // Cache miss: load from the source of truth.
    const data = await this.fallback.fetch(key);

    // JSON.stringify yields `undefined` for values with no JSON form
    // (e.g. `undefined` itself). Writing that to Redis would store an entry
    // that cannot be parsed back, so such values are simply not cached.
    const serialized = JSON.stringify(data);
    if (serialized !== undefined) {
      await this.redis.setex(key, options.ttl, serialized);
    }

    return data;
  }

  /**
   * Deletes every key matching a glob-style `pattern`.
   *
   * The keyspace is walked incrementally with `SCAN` rather than `KEYS`:
   * `KEYS` examines the whole keyspace in a single blocking command, which
   * the Redis documentation warns against for production use.
   *
   * @param pattern - Glob pattern passed to `SCAN ... MATCH`.
   */
  async invalidate(pattern: string): Promise<void> {
    let cursor = '0';
    do {
      const [nextCursor, keys] = await this.redis.scan(
        cursor,
        'MATCH',
        pattern,
        'COUNT',
        100
      );
      if (keys.length > 0) {
        await this.redis.del(keys);
      }
      cursor = nextCursor;
      // SCAN signals the end of the iteration by returning cursor "0".
    } while (cursor !== '0');
  }
}
2. Load Balancing
# nginx/conf.d/load-balancer.conf
# Pool of application servers that the proxy balances across.
upstream backend {
    # Send each request to the server with the fewest active connections.
    least_conn;

    # A server is considered unavailable for 30s after 3 failed attempts
    # within 30s.
    server backend1.example.com:8080 max_fails=3 fail_timeout=30s;
    server backend2.example.com:8080 max_fails=3 fail_timeout=30s;
    server backend3.example.com:8080 max_fails=3 fail_timeout=30s;

    # Keep up to 32 idle connections per worker open to the upstream servers.
    # Without this directive the HTTP/1.1 + empty "Connection" header settings
    # on the proxy side have nothing to reuse and every request opens a new
    # upstream connection. Must appear after the load-balancing method.
    keepalive 32;
}
# Public entry point: accepts HTTP for api.example.com and proxies every
# request to the "backend" upstream group.
server {
    listen      80;
    server_name api.example.com;

    location / {
        proxy_pass http://backend;

        # Forward the original request context to the application servers.
        proxy_set_header Host              $host;
        proxy_set_header X-Real-IP         $remote_addr;
        proxy_set_header X-Forwarded-For   $proxy_add_x_forwarded_for;
        proxy_set_header X-Forwarded-Proto $scheme;

        # Prerequisites for upstream keepalive: speak HTTP/1.1 and clear the
        # "Connection" header. Connection reuse additionally requires a
        # `keepalive` directive in the upstream block.
        proxy_http_version 1.1;
        proxy_set_header   Connection "";

        # Limits for connecting to, sending to, and reading from the upstream.
        proxy_connect_timeout 60s;
        proxy_send_timeout    60s;
        proxy_read_timeout    60s;
    }
}
Data Management
1. Database Sharding
// infrastructure/ShardingStrategy.ts
/**
 * Contract for mapping record keys onto shard identifiers.
 */
interface ShardingStrategy {
  /**
   * Resolves the shard responsible for `key`.
   *
   * @param key - Key of the record being routed.
   * @returns Identifier of the shard that owns the key.
   */
  getShardId(key: string): string;

  /**
   * Lists the identifiers of all shards known to the strategy.
   */
  getAllShards(): string[];
}
/**
 * `ShardingStrategy` that delegates every decision to a `ConsistentHashRing`.
 */
class ConsistentHashingStrategy implements ShardingStrategy {
  private readonly ring: ConsistentHashRing;

  /**
   * @param shards - Shard identifiers handed to the ring.
   * @param replicas - Second argument handed to the ring; defaults to 3.
   */
  constructor(shards: string[], replicas = 3) {
    const hashRing = new ConsistentHashRing(shards, replicas);
    this.ring = hashRing;
  }

  /** Returns the node the ring selects for `key`. */
  getShardId(key: string): string {
    const owner = this.ring.getNode(key);
    return owner;
  }

  /** Returns the nodes reported by the ring. */
  getAllShards(): string[] {
    const nodes = this.ring.getNodes();
    return nodes;
  }
}
// services/ShardedRepository.ts
/**
 * Repository that routes each key to the database connection of the shard
 * chosen by the sharding strategy.
 */
class ShardedRepository<T> {
  constructor(
    private readonly shardingStrategy: ShardingStrategy,
    private readonly connectionPool: Map<string, Database>
  ) {}

  /**
   * Stores `data` under `key` on the shard that owns the key.
   *
   * @throws ShardNotFoundError when the pool has no connection for the shard.
   */
  async save(key: string, data: T): Promise<void> {
    const target = this.connectionFor(key);
    await target.save(key, data);
  }

  /**
   * Reads the value stored under `key` from the shard that owns the key.
   *
   * @throws ShardNotFoundError when the pool has no connection for the shard.
   */
  async get(key: string): Promise<T | null> {
    const target = this.connectionFor(key);
    return target.get(key);
  }

  /** Looks up the pooled connection for the shard that owns `key`. */
  private connectionFor(key: string): Database {
    const shardId = this.shardingStrategy.getShardId(key);
    const pooled = this.connectionPool.get(shardId);
    if (pooled) {
      return pooled;
    }
    throw new ShardNotFoundError(shardId);
  }
}
2. CQRS Pattern
// domain/commands/CreateOrder.ts
/**
 * Write-side message describing an order to be created.
 */
interface CreateOrderCommand {
  /** Identifier of the user placing the order. */
  userId: string;

  /** Order lines: one product id and quantity per entry. */
  items: {
    productId: string;
    quantity: number;
  }[];

  /** Address for shipping the order. */
  shippingAddress: Address;
}
// domain/queries/GetOrderSummary.ts
/**
 * Read-side message requesting the summary of a single order.
 */
interface GetOrderSummaryQuery {
  /** Identifier of the order to summarize. */
  orderId: string;

  /** Sections to include; each entry is one of the three listed literals. */
  include: ('items' | 'shipping' | 'payment')[];
}
// application/CommandBus.ts
/**
 * Dispatches commands to the handler registered for their type.
 */
class CommandBus {
  private handlers = new Map<string, CommandHandler>();

  /**
   * Registers `handler` under `commandType`, replacing any handler already
   * registered under the same name.
   *
   * @param commandType - Name the handler is looked up by in `execute`.
   * @param handler - Handler invoked for commands of that type.
   */
  register<T extends Command>(
    commandType: string,
    handler: CommandHandler<T>
  ): void {
    this.handlers.set(commandType, handler);
  }

  /**
   * Runs the handler registered for the command.
   *
   * @param command - The command to execute.
   * @param commandType - Name the handler was registered under. Defaults to
   *   the command's constructor name, which only identifies class instances:
   *   a plain object literal reports "Object", and minifiers may rename
   *   classes. Pass the type explicitly for interface-shaped commands.
   * @throws HandlerNotFoundError when no handler is registered for the type.
   */
  async execute<T extends Command>(
    command: T,
    commandType: string = command.constructor.name
  ): Promise<void> {
    const handler = this.handlers.get(commandType);
    if (!handler) {
      throw new HandlerNotFoundError(command);
    }
    await handler.execute(command);
  }
}
// application/QueryBus.ts
/**
 * Dispatches queries to the handler registered for their type and returns
 * the handler's result.
 */
class QueryBus {
  private handlers = new Map<string, QueryHandler>();

  /**
   * Registers `handler` under `queryType`, replacing any handler already
   * registered under the same name.
   *
   * @param queryType - Name the handler is looked up by in `execute`.
   * @param handler - Handler invoked for queries of that type.
   */
  register<T extends Query, R>(
    queryType: string,
    handler: QueryHandler<T, R>
  ): void {
    this.handlers.set(queryType, handler);
  }

  /**
   * Runs the handler registered for the query and returns its result.
   *
   * @param query - The query to execute.
   * @param queryType - Name the handler was registered under. Defaults to the
   *   query's constructor name, which only identifies class instances: a
   *   plain object literal reports "Object", and minifiers may rename
   *   classes. Pass the type explicitly for interface-shaped queries.
   * @returns Whatever the handler returns.
   * @throws HandlerNotFoundError when no handler is registered for the type.
   */
  async execute<T extends Query, R>(
    query: T,
    queryType: string = query.constructor.name
  ): Promise<R> {
    const handler = this.handlers.get(queryType);
    if (!handler) {
      throw new HandlerNotFoundError(query);
    }
    return handler.execute(query);
  }
}
Security and Resilience
1. Circuit Breaker Pattern
// infrastructure/CircuitBreaker.ts
/**
 * Circuit breaker around an asynchronous operation.
 *
 * - CLOSED: calls go through; failures are counted.
 * - OPEN: calls are short-circuited (fallback or `CircuitBreakerOpenError`)
 *   until more than `timeout` milliseconds have passed since the last failure.
 * - HALF_OPEN: calls go through again; a success closes the breaker, a
 *   failure re-opens it because the failure count is still at the threshold.
 */
class CircuitBreaker {
  private state: 'CLOSED' | 'OPEN' | 'HALF_OPEN' = 'CLOSED';
  private failureCount = 0;
  private lastFailureTime: number | null = null;

  /**
   * @param threshold - Number of counted failures at which the breaker opens.
   * @param timeout - Milliseconds to stay open before allowing a trial call.
   */
  constructor(
    private readonly threshold: number,
    private readonly timeout: number
  ) {}

  /**
   * Runs `operation` under the breaker.
   *
   * @param operation - The protected call.
   * @param fallback - Optional substitute used when the breaker is open or
   *   the operation fails.
   * @returns The operation's result, or the fallback's result.
   * @throws CircuitBreakerOpenError when open and no fallback is given.
   * @throws The operation's own error when it fails and no fallback is given.
   */
  async execute<T>(
    operation: () => Promise<T>,
    fallback?: () => Promise<T>
  ): Promise<T> {
    if (this.state === 'OPEN') {
      if (this.shouldRetry()) {
        this.state = 'HALF_OPEN';
      } else if (fallback) {
        return fallback();
      } else {
        throw new CircuitBreakerOpenError();
      }
    }

    try {
      const result = await operation();
      this.onSuccess();
      return result;
    } catch (error: unknown) {
      return this.onError(error, fallback);
    }
  }

  /** Any success resets the failure count and closes the breaker. */
  private onSuccess(): void {
    this.failureCount = 0;
    this.state = 'CLOSED';
  }

  /**
   * Records a failure, opens the breaker once the threshold is reached, then
   * resolves with the fallback or rethrows the original error. Typed as
   * `unknown` because a rejected promise may carry any value.
   */
  private onError<T>(
    error: unknown,
    fallback?: () => Promise<T>
  ): Promise<T> {
    this.failureCount++;
    this.lastFailureTime = Date.now();

    if (this.failureCount >= this.threshold) {
      this.state = 'OPEN';
    }

    if (fallback) {
      return fallback();
    }
    throw error;
  }

  /** True once more than `timeout` ms have passed since the last failure. */
  private shouldRetry(): boolean {
    // Compare against null explicitly: a timestamp of 0 is a legitimate clock
    // reading (e.g. under fake timers) and must not be read as "no failure".
    if (this.lastFailureTime === null) return false;
    return Date.now() - this.lastFailureTime > this.timeout;
  }
}
2. Rate Limiting
// middleware/RateLimiter.ts
/**
 * Sliding-window rate limiter backed by a Redis sorted set per key.
 *
 * Every call records the request in the set (scored by its timestamp), drops
 * entries older than the window, and compares the remaining count to `max`.
 * Rejected calls are recorded too, so they count against the window.
 */
class RateLimiter {
  constructor(
    private readonly redis: Redis,
    private readonly options: {
      window: number; // Time window in seconds
      max: number; // Maximum requests per window
      keyPrefix: string;
    }
  ) {}

  /**
   * Records a request for `key` and reports whether it fits in the window.
   *
   * @param key - Caller identity (the usage example passes the client IP).
   * @returns `true` when the request count, including this request, is at
   *   most `max`.
   * @throws When the pipeline yields no replies, when any queued command
   *   failed, or when the count reply is not a number.
   */
  async isAllowed(key: string): Promise<boolean> {
    const now = Date.now();
    const windowKey = `${this.options.keyPrefix}:${key}`;

    const pipeline = this.redis.pipeline();
    // Remove requests that fell out of the window.
    pipeline.zremrangebyscore(windowKey, 0, now - this.options.window * 1000);
    // Add the current request; the random suffix keeps members unique when
    // several requests share a millisecond.
    pipeline.zadd(windowKey, now, `${now}-${Math.random()}`);
    // Count requests in the window (third queued command, reply index 2).
    pipeline.zcard(windowKey);
    // Let idle keys expire on their own.
    pipeline.expire(windowKey, this.options.window);

    // exec() resolves to one [error, result] tuple per queued command, in
    // order. The count is the SECOND element of the third tuple; reading the
    // first element yields the error slot (null on success), and
    // `null <= max` is always true, which would allow every request.
    const replies = await pipeline.exec();
    if (!replies) {
      throw new Error('Rate limiter pipeline returned no replies');
    }
    for (const [commandError] of replies) {
      if (commandError) {
        throw commandError;
      }
    }

    const count = replies[2]?.[1];
    if (typeof count !== 'number') {
      throw new Error('Rate limiter received an unexpected ZCARD reply');
    }
    return count <= this.options.max;
  }
}
// Example usage in API Gateway
// Window shared by the limiter configuration and the 429 response so the two
// cannot drift apart.
const RATE_LIMIT_WINDOW_SECONDS = 60;

// Built once at startup: the limiter keeps no per-request state (the counters
// live in Redis), so constructing it on every request is wasted work.
const apiRateLimiter = new RateLimiter(redis, {
  window: RATE_LIMIT_WINDOW_SECONDS, // 1 minute
  max: 100, // 100 requests per minute
  keyPrefix: 'rate-limit'
});

/**
 * Gateway middleware: rejects a client with HTTP 429 once it exceeds the
 * configured number of requests per window, keyed by `req.ip`.
 */
app.use(async (req, res, next) => {
  let allowed: boolean;
  try {
    allowed = await apiRateLimiter.isAllowed(req.ip);
  } catch (err) {
    // Forward limiter failures to the error handler explicitly; a rejected
    // promise from an async middleware is otherwise not reported to `next`,
    // which would leave the request hanging.
    next(err);
    return;
  }

  if (!allowed) {
    // Standard header for clients that do not parse the JSON body.
    res.set('Retry-After', String(RATE_LIMIT_WINDOW_SECONDS));
    res.status(429).json({
      error: 'Too Many Requests',
      retryAfter: RATE_LIMIT_WINDOW_SECONDS
    });
    return;
  }

  next();
});
Monitoring and Observability
1. Distributed Tracing
// infrastructure/Tracer.ts
/**
 * One timed unit of work within a trace, as produced by `Tracer.createSpan`.
 */
interface Span {
  /** Identifier of this span. */
  id: string;

  /** Identifier of the trace the span belongs to; inherited from the parent span when there is one. */
  traceId: string;

  /** Identifier of the parent span; absent for a span created without a parent. */
  parentId?: string;

  /** Name given to the span at creation. */
  name: string;

  /** `Date.now()` reading taken when the span was created. */
  startTime: number;

  /** `Date.now()` reading taken by `Tracer.endSpan`; unset until then. */
  endTime?: number;

  /** String key/value tags attached to the span. */
  tags: Record<string, string>;

  /** Timestamped events recorded on the span; starts out empty. */
  events: {
    time: number;
    name: string;
    attributes?: Record<string, string>;
  }[];
}
/**
 * Creates spans and, when they finish, exports the ones the sampler selects.
 */
class Tracer {
  constructor(
    private readonly exporter: TraceExporter,
    private readonly sampler: TraceSampler
  ) {}

  /**
   * Starts a new span.
   *
   * @param name - Name of the span.
   * @param options - Optional parent span (whose trace id is reused and whose
   *   id becomes `parentId`) and initial tags.
   * @returns A span stamped with the current time and no events.
   */
  createSpan(
    name: string,
    options?: {
      parent?: Span;
      tags?: Record<string, string>;
    }
  ): Span {
    const parent = options?.parent;
    const initialTags = options?.tags;

    // A child stays in its parent's trace; a root span opens a new trace.
    const traceId = parent?.traceId ?? generateTraceId();

    const span: Span = {
      id: generateSpanId(),
      traceId,
      parentId: parent?.id,
      name,
      startTime: Date.now(),
      tags: initialTags ?? {},
      events: []
    };
    return span;
  }

  /**
   * Stamps the span's end time and exports it if the sampler selects it.
   *
   * @param span - The span to finish; its `endTime` is set in place.
   */
  async endSpan(span: Span): Promise<void> {
    span.endTime = Date.now();

    const selected = this.sampler.shouldSample(span);
    if (!selected) {
      return;
    }
    await this.exporter.export(span);
  }
}
2. Metrics Collection
// monitoring/Metrics.ts
/**
 * Creates metrics on first use, registers them with the registry, and hands
 * back the same instance on later requests for the same name.
 */
class MetricsCollector {
  private metrics: Map<string, Metric> = new Map();

  /**
   * @param registry - Registry every newly created metric is registered with.
   * @param labels - Labels passed to every metric this collector creates.
   */
  constructor(
    private readonly registry: MetricsRegistry,
    private readonly labels: Record<string, string> = {}
  ) {}

  /**
   * Returns the counter named `name`, creating and registering it on first use.
   *
   * @throws Error when `name` is already taken by a metric of another kind.
   */
  counter(name: string, help: string): Counter {
    return this.getOrCreate(name, Counter, () => new Counter(name, help, this.labels));
  }

  /**
   * Returns the histogram named `name`, creating and registering it on first
   * use. `buckets` only applies on creation; a later call with different
   * buckets gets the existing histogram unchanged.
   *
   * @throws Error when `name` is already taken by a metric of another kind.
   */
  histogram(
    name: string,
    help: string,
    buckets: number[]
  ): Histogram {
    return this.getOrCreate(
      name,
      Histogram,
      () => new Histogram(name, help, buckets, this.labels)
    );
  }

  /**
   * Returns the gauge named `name`, creating and registering it on first use.
   *
   * @throws Error when `name` is already taken by a metric of another kind.
   */
  gauge(name: string, help: string): Gauge {
    return this.getOrCreate(name, Gauge, () => new Gauge(name, help, this.labels));
  }

  /**
   * Shared get-or-create step for all metric kinds.
   *
   * A name that already belongs to a different kind is rejected instead of
   * being returned under a cast: handing a Gauge to code that asked for a
   * Counter would only fail later, far from the naming mistake.
   */
  private getOrCreate<M extends Metric>(
    name: string,
    kind: new (...args: never[]) => M,
    create: () => M
  ): M {
    const existing = this.metrics.get(name);
    if (existing === undefined) {
      const created = create();
      this.metrics.set(name, created);
      this.registry.register(created);
      return created;
    }
    if (!(existing instanceof kind)) {
      throw new Error(
        `Metric "${name}" is already registered as ${existing.constructor.name}, not ${kind.name}`
      );
    }
    return existing;
  }
}
// Example usage
// Collector for this service; the labels are passed to every metric it creates.
const metrics = new MetricsCollector(registry, {
  service: 'order-service',
  environment: 'production'
});

// Request latency in seconds, bucketed from 100 ms to 5 s.
const requestDuration = metrics.histogram(
  'http_request_duration_seconds',
  'HTTP request duration in seconds',
  [0.1, 0.3, 0.5, 0.7, 1, 2, 5]
);

// Declared here for use by the order-processing code.
const activeOrders = metrics.gauge(
  'active_orders',
  'Number of orders currently being processed'
);

/**
 * Records the duration of every HTTP request once its response has finished.
 */
app.use((req, res, next) => {
  // Monotonic clock: unlike Date.now(), it cannot jump when the wall clock
  // is adjusted, so a duration can never come out negative or inflated.
  const start = process.hrtime.bigint();

  res.on('finish', () => {
    const duration = Number(process.hrtime.bigint() - start) / 1e9;

    // Label with the matched route template (e.g. "/orders/:id") rather than
    // the raw URL path: raw paths embed ids and create an unbounded number
    // of label values. Requests that matched no route share one label.
    // NOTE(review): assumes Express-style `req.route` / `req.baseUrl`,
    // populated by the time "finish" fires — confirm against the framework.
    const path = req.route
      ? `${req.baseUrl ?? ''}${String(req.route.path)}`
      : 'unmatched';

    requestDuration.observe(duration, {
      method: req.method,
      path,
      status: res.statusCode.toString()
    });
  });

  next();
});
Conclusion
System design is a journey of continuous learning and improvement. Key takeaways:
- **Start with Requirements**
  - Understand both functional and non-functional needs
  - Document constraints and assumptions
- **Choose Patterns Wisely**
  - Select architectures that match your scale
  - Consider operational complexity
- **Plan for Scale**
  - Design for horizontal scaling
  - Implement caching strategies
  - Use appropriate data storage patterns
- **Build for Resilience**
  - Implement circuit breakers
  - Use rate limiting
  - Design for failure
- **Monitor Everything**
  - Implement comprehensive logging
  - Use distributed tracing
  - Collect and analyze metrics
Remember:
- Keep it simple initially
- Make data-driven decisions
- Document your design choices
- Plan for future scale
- Consider operational aspects
The art of system design is about finding the right balance between complexity and maintainability while meeting both current and future needs.